tgsi_exec.c revision 4c0f1fb5ec6117f07c9c911d7f74ff0d18c51d98
1/**************************************************************************
2 *
3 * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
4 * All Rights Reserved.
5 * Copyright 2009-2010 VMware, Inc.  All rights Reserved.
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the
9 * "Software"), to deal in the Software without restriction, including
10 * without limitation the rights to use, copy, modify, merge, publish,
11 * distribute, sub license, and/or sell copies of the Software, and to
12 * permit persons to whom the Software is furnished to do so, subject to
13 * the following conditions:
14 *
15 * The above copyright notice and this permission notice (including the
16 * next paragraph) shall be included in all copies or substantial portions
17 * of the Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
22 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
23 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
24 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
25 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 *
27 **************************************************************************/
28
29/**
30 * TGSI interpreter/executor.
31 *
32 * Flow control information:
33 *
34 * Since we operate on 'quads' (4 pixels or 4 vertices in parallel)
35 * flow control statements (IF/ELSE/ENDIF, LOOP/ENDLOOP) require special
36 * care since a condition may be true for some quad components but false
37 * for other components.
38 *
39 * We basically execute all statements (even if they're in the part of
40 * an IF/ELSE clause that's "not taken") and use a special mask to
41 * control writing to destination registers.  This is the ExecMask.
42 * See store_dest().
43 *
 * The ExecMask is computed from several other masks (see UPDATE_EXEC_MASK).
 * Three of them -- CondMask, LoopMask and ContMask -- are controlled by the
 * flow control instructions (namely: IF/ELSE/ENDIF, LOOP/ENDLOOP and CONT).
47 *
48 *
49 * Authors:
50 *   Michal Krol
51 *   Brian Paul
52 */
53
54#include "pipe/p_compiler.h"
55#include "pipe/p_state.h"
56#include "pipe/p_shader_tokens.h"
57#include "tgsi/tgsi_dump.h"
58#include "tgsi/tgsi_parse.h"
59#include "tgsi/tgsi_util.h"
60#include "tgsi_exec.h"
61#include "util/u_memory.h"
62#include "util/u_math.h"
63
64
/*
 * When non-zero, micro_exp2/micro_lg2/micro_pow call the util_fast_*
 * helpers instead of the libm functions.
 */
#define FAST_MATH 0

/*
 * Indices into the four values of a channel, used by micro_ddx/micro_ddy
 * to pick the values that are subtracted from each other.
 */
#define TILE_TOP_LEFT     0
#define TILE_TOP_RIGHT    1
#define TILE_BOTTOM_LEFT  2
#define TILE_BOTTOM_RIGHT 3
71
72static void
73micro_abs(union tgsi_exec_channel *dst,
74          const union tgsi_exec_channel *src)
75{
76   dst->f[0] = fabsf(src->f[0]);
77   dst->f[1] = fabsf(src->f[1]);
78   dst->f[2] = fabsf(src->f[2]);
79   dst->f[3] = fabsf(src->f[3]);
80}
81
82static void
83micro_arl(union tgsi_exec_channel *dst,
84          const union tgsi_exec_channel *src)
85{
86   dst->i[0] = (int)floorf(src->f[0]);
87   dst->i[1] = (int)floorf(src->f[1]);
88   dst->i[2] = (int)floorf(src->f[2]);
89   dst->i[3] = (int)floorf(src->f[3]);
90}
91
92static void
93micro_arr(union tgsi_exec_channel *dst,
94          const union tgsi_exec_channel *src)
95{
96   dst->i[0] = (int)floorf(src->f[0] + 0.5f);
97   dst->i[1] = (int)floorf(src->f[1] + 0.5f);
98   dst->i[2] = (int)floorf(src->f[2] + 0.5f);
99   dst->i[3] = (int)floorf(src->f[3] + 0.5f);
100}
101
102static void
103micro_ceil(union tgsi_exec_channel *dst,
104           const union tgsi_exec_channel *src)
105{
106   dst->f[0] = ceilf(src->f[0]);
107   dst->f[1] = ceilf(src->f[1]);
108   dst->f[2] = ceilf(src->f[2]);
109   dst->f[3] = ceilf(src->f[3]);
110}
111
112static void
113micro_clamp(union tgsi_exec_channel *dst,
114            const union tgsi_exec_channel *src0,
115            const union tgsi_exec_channel *src1,
116            const union tgsi_exec_channel *src2)
117{
118   dst->f[0] = src0->f[0] < src1->f[0] ? src1->f[0] : src0->f[0] > src2->f[0] ? src2->f[0] : src0->f[0];
119   dst->f[1] = src0->f[1] < src1->f[1] ? src1->f[1] : src0->f[1] > src2->f[1] ? src2->f[1] : src0->f[1];
120   dst->f[2] = src0->f[2] < src1->f[2] ? src1->f[2] : src0->f[2] > src2->f[2] ? src2->f[2] : src0->f[2];
121   dst->f[3] = src0->f[3] < src1->f[3] ? src1->f[3] : src0->f[3] > src2->f[3] ? src2->f[3] : src0->f[3];
122}
123
124static void
125micro_cmp(union tgsi_exec_channel *dst,
126          const union tgsi_exec_channel *src0,
127          const union tgsi_exec_channel *src1,
128          const union tgsi_exec_channel *src2)
129{
130   dst->f[0] = src0->f[0] < 0.0f ? src1->f[0] : src2->f[0];
131   dst->f[1] = src0->f[1] < 0.0f ? src1->f[1] : src2->f[1];
132   dst->f[2] = src0->f[2] < 0.0f ? src1->f[2] : src2->f[2];
133   dst->f[3] = src0->f[3] < 0.0f ? src1->f[3] : src2->f[3];
134}
135
136static void
137micro_cnd(union tgsi_exec_channel *dst,
138          const union tgsi_exec_channel *src0,
139          const union tgsi_exec_channel *src1,
140          const union tgsi_exec_channel *src2)
141{
142   dst->f[0] = src2->f[0] > 0.5f ? src0->f[0] : src1->f[0];
143   dst->f[1] = src2->f[1] > 0.5f ? src0->f[1] : src1->f[1];
144   dst->f[2] = src2->f[2] > 0.5f ? src0->f[2] : src1->f[2];
145   dst->f[3] = src2->f[3] > 0.5f ? src0->f[3] : src1->f[3];
146}
147
148static void
149micro_cos(union tgsi_exec_channel *dst,
150          const union tgsi_exec_channel *src)
151{
152   dst->f[0] = cosf(src->f[0]);
153   dst->f[1] = cosf(src->f[1]);
154   dst->f[2] = cosf(src->f[2]);
155   dst->f[3] = cosf(src->f[3]);
156}
157
158static void
159micro_ddx(union tgsi_exec_channel *dst,
160          const union tgsi_exec_channel *src)
161{
162   dst->f[0] =
163   dst->f[1] =
164   dst->f[2] =
165   dst->f[3] = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_BOTTOM_LEFT];
166}
167
168static void
169micro_ddy(union tgsi_exec_channel *dst,
170          const union tgsi_exec_channel *src)
171{
172   dst->f[0] =
173   dst->f[1] =
174   dst->f[2] =
175   dst->f[3] = src->f[TILE_BOTTOM_LEFT] - src->f[TILE_TOP_LEFT];
176}
177
178static void
179micro_exp2(union tgsi_exec_channel *dst,
180           const union tgsi_exec_channel *src)
181{
182#if FAST_MATH
183   dst->f[0] = util_fast_exp2(src->f[0]);
184   dst->f[1] = util_fast_exp2(src->f[1]);
185   dst->f[2] = util_fast_exp2(src->f[2]);
186   dst->f[3] = util_fast_exp2(src->f[3]);
187#else
188#if DEBUG
189   /* Inf is okay for this instruction, so clamp it to silence assertions. */
190   uint i;
191   union tgsi_exec_channel clamped;
192
193   for (i = 0; i < 4; i++) {
194      if (src->f[i] > 127.99999f) {
195         clamped.f[i] = 127.99999f;
196      } else if (src->f[i] < -126.99999f) {
197         clamped.f[i] = -126.99999f;
198      } else {
199         clamped.f[i] = src->f[i];
200      }
201   }
202   src = &clamped;
203#endif /* DEBUG */
204
205   dst->f[0] = powf(2.0f, src->f[0]);
206   dst->f[1] = powf(2.0f, src->f[1]);
207   dst->f[2] = powf(2.0f, src->f[2]);
208   dst->f[3] = powf(2.0f, src->f[3]);
209#endif /* FAST_MATH */
210}
211
212static void
213micro_flr(union tgsi_exec_channel *dst,
214          const union tgsi_exec_channel *src)
215{
216   dst->f[0] = floorf(src->f[0]);
217   dst->f[1] = floorf(src->f[1]);
218   dst->f[2] = floorf(src->f[2]);
219   dst->f[3] = floorf(src->f[3]);
220}
221
222static void
223micro_frc(union tgsi_exec_channel *dst,
224          const union tgsi_exec_channel *src)
225{
226   dst->f[0] = src->f[0] - floorf(src->f[0]);
227   dst->f[1] = src->f[1] - floorf(src->f[1]);
228   dst->f[2] = src->f[2] - floorf(src->f[2]);
229   dst->f[3] = src->f[3] - floorf(src->f[3]);
230}
231
232static void
233micro_iabs(union tgsi_exec_channel *dst,
234           const union tgsi_exec_channel *src)
235{
236   dst->i[0] = src->i[0] >= 0 ? src->i[0] : -src->i[0];
237   dst->i[1] = src->i[1] >= 0 ? src->i[1] : -src->i[1];
238   dst->i[2] = src->i[2] >= 0 ? src->i[2] : -src->i[2];
239   dst->i[3] = src->i[3] >= 0 ? src->i[3] : -src->i[3];
240}
241
242static void
243micro_ineg(union tgsi_exec_channel *dst,
244           const union tgsi_exec_channel *src)
245{
246   dst->i[0] = -src->i[0];
247   dst->i[1] = -src->i[1];
248   dst->i[2] = -src->i[2];
249   dst->i[3] = -src->i[3];
250}
251
252static void
253micro_lg2(union tgsi_exec_channel *dst,
254          const union tgsi_exec_channel *src)
255{
256#if FAST_MATH
257   dst->f[0] = util_fast_log2(src->f[0]);
258   dst->f[1] = util_fast_log2(src->f[1]);
259   dst->f[2] = util_fast_log2(src->f[2]);
260   dst->f[3] = util_fast_log2(src->f[3]);
261#else
262   dst->f[0] = logf(src->f[0]) * 1.442695f;
263   dst->f[1] = logf(src->f[1]) * 1.442695f;
264   dst->f[2] = logf(src->f[2]) * 1.442695f;
265   dst->f[3] = logf(src->f[3]) * 1.442695f;
266#endif
267}
268
269static void
270micro_lrp(union tgsi_exec_channel *dst,
271          const union tgsi_exec_channel *src0,
272          const union tgsi_exec_channel *src1,
273          const union tgsi_exec_channel *src2)
274{
275   dst->f[0] = src0->f[0] * (src1->f[0] - src2->f[0]) + src2->f[0];
276   dst->f[1] = src0->f[1] * (src1->f[1] - src2->f[1]) + src2->f[1];
277   dst->f[2] = src0->f[2] * (src1->f[2] - src2->f[2]) + src2->f[2];
278   dst->f[3] = src0->f[3] * (src1->f[3] - src2->f[3]) + src2->f[3];
279}
280
281static void
282micro_mad(union tgsi_exec_channel *dst,
283          const union tgsi_exec_channel *src0,
284          const union tgsi_exec_channel *src1,
285          const union tgsi_exec_channel *src2)
286{
287   dst->f[0] = src0->f[0] * src1->f[0] + src2->f[0];
288   dst->f[1] = src0->f[1] * src1->f[1] + src2->f[1];
289   dst->f[2] = src0->f[2] * src1->f[2] + src2->f[2];
290   dst->f[3] = src0->f[3] * src1->f[3] + src2->f[3];
291}
292
293static void
294micro_mov(union tgsi_exec_channel *dst,
295          const union tgsi_exec_channel *src)
296{
297   dst->u[0] = src->u[0];
298   dst->u[1] = src->u[1];
299   dst->u[2] = src->u[2];
300   dst->u[3] = src->u[3];
301}
302
303static void
304micro_rcp(union tgsi_exec_channel *dst,
305          const union tgsi_exec_channel *src)
306{
307#if 0 /* for debugging */
308   assert(src->f[0] != 0.0f);
309   assert(src->f[1] != 0.0f);
310   assert(src->f[2] != 0.0f);
311   assert(src->f[3] != 0.0f);
312#endif
313   dst->f[0] = 1.0f / src->f[0];
314   dst->f[1] = 1.0f / src->f[1];
315   dst->f[2] = 1.0f / src->f[2];
316   dst->f[3] = 1.0f / src->f[3];
317}
318
319static void
320micro_rnd(union tgsi_exec_channel *dst,
321          const union tgsi_exec_channel *src)
322{
323   dst->f[0] = floorf(src->f[0] + 0.5f);
324   dst->f[1] = floorf(src->f[1] + 0.5f);
325   dst->f[2] = floorf(src->f[2] + 0.5f);
326   dst->f[3] = floorf(src->f[3] + 0.5f);
327}
328
329static void
330micro_rsq(union tgsi_exec_channel *dst,
331          const union tgsi_exec_channel *src)
332{
333#if 0 /* for debugging */
334   assert(src->f[0] != 0.0f);
335   assert(src->f[1] != 0.0f);
336   assert(src->f[2] != 0.0f);
337   assert(src->f[3] != 0.0f);
338#endif
339   dst->f[0] = 1.0f / sqrtf(fabsf(src->f[0]));
340   dst->f[1] = 1.0f / sqrtf(fabsf(src->f[1]));
341   dst->f[2] = 1.0f / sqrtf(fabsf(src->f[2]));
342   dst->f[3] = 1.0f / sqrtf(fabsf(src->f[3]));
343}
344
345static void
346micro_seq(union tgsi_exec_channel *dst,
347          const union tgsi_exec_channel *src0,
348          const union tgsi_exec_channel *src1)
349{
350   dst->f[0] = src0->f[0] == src1->f[0] ? 1.0f : 0.0f;
351   dst->f[1] = src0->f[1] == src1->f[1] ? 1.0f : 0.0f;
352   dst->f[2] = src0->f[2] == src1->f[2] ? 1.0f : 0.0f;
353   dst->f[3] = src0->f[3] == src1->f[3] ? 1.0f : 0.0f;
354}
355
356static void
357micro_sge(union tgsi_exec_channel *dst,
358          const union tgsi_exec_channel *src0,
359          const union tgsi_exec_channel *src1)
360{
361   dst->f[0] = src0->f[0] >= src1->f[0] ? 1.0f : 0.0f;
362   dst->f[1] = src0->f[1] >= src1->f[1] ? 1.0f : 0.0f;
363   dst->f[2] = src0->f[2] >= src1->f[2] ? 1.0f : 0.0f;
364   dst->f[3] = src0->f[3] >= src1->f[3] ? 1.0f : 0.0f;
365}
366
367static void
368micro_sgn(union tgsi_exec_channel *dst,
369          const union tgsi_exec_channel *src)
370{
371   dst->f[0] = src->f[0] < 0.0f ? -1.0f : src->f[0] > 0.0f ? 1.0f : 0.0f;
372   dst->f[1] = src->f[1] < 0.0f ? -1.0f : src->f[1] > 0.0f ? 1.0f : 0.0f;
373   dst->f[2] = src->f[2] < 0.0f ? -1.0f : src->f[2] > 0.0f ? 1.0f : 0.0f;
374   dst->f[3] = src->f[3] < 0.0f ? -1.0f : src->f[3] > 0.0f ? 1.0f : 0.0f;
375}
376
377static void
378micro_sgt(union tgsi_exec_channel *dst,
379          const union tgsi_exec_channel *src0,
380          const union tgsi_exec_channel *src1)
381{
382   dst->f[0] = src0->f[0] > src1->f[0] ? 1.0f : 0.0f;
383   dst->f[1] = src0->f[1] > src1->f[1] ? 1.0f : 0.0f;
384   dst->f[2] = src0->f[2] > src1->f[2] ? 1.0f : 0.0f;
385   dst->f[3] = src0->f[3] > src1->f[3] ? 1.0f : 0.0f;
386}
387
388static void
389micro_sin(union tgsi_exec_channel *dst,
390          const union tgsi_exec_channel *src)
391{
392   dst->f[0] = sinf(src->f[0]);
393   dst->f[1] = sinf(src->f[1]);
394   dst->f[2] = sinf(src->f[2]);
395   dst->f[3] = sinf(src->f[3]);
396}
397
398static void
399micro_sle(union tgsi_exec_channel *dst,
400          const union tgsi_exec_channel *src0,
401          const union tgsi_exec_channel *src1)
402{
403   dst->f[0] = src0->f[0] <= src1->f[0] ? 1.0f : 0.0f;
404   dst->f[1] = src0->f[1] <= src1->f[1] ? 1.0f : 0.0f;
405   dst->f[2] = src0->f[2] <= src1->f[2] ? 1.0f : 0.0f;
406   dst->f[3] = src0->f[3] <= src1->f[3] ? 1.0f : 0.0f;
407}
408
409static void
410micro_slt(union tgsi_exec_channel *dst,
411          const union tgsi_exec_channel *src0,
412          const union tgsi_exec_channel *src1)
413{
414   dst->f[0] = src0->f[0] < src1->f[0] ? 1.0f : 0.0f;
415   dst->f[1] = src0->f[1] < src1->f[1] ? 1.0f : 0.0f;
416   dst->f[2] = src0->f[2] < src1->f[2] ? 1.0f : 0.0f;
417   dst->f[3] = src0->f[3] < src1->f[3] ? 1.0f : 0.0f;
418}
419
420static void
421micro_sne(union tgsi_exec_channel *dst,
422          const union tgsi_exec_channel *src0,
423          const union tgsi_exec_channel *src1)
424{
425   dst->f[0] = src0->f[0] != src1->f[0] ? 1.0f : 0.0f;
426   dst->f[1] = src0->f[1] != src1->f[1] ? 1.0f : 0.0f;
427   dst->f[2] = src0->f[2] != src1->f[2] ? 1.0f : 0.0f;
428   dst->f[3] = src0->f[3] != src1->f[3] ? 1.0f : 0.0f;
429}
430
431static void
432micro_sfl(union tgsi_exec_channel *dst)
433{
434   dst->f[0] = 0.0f;
435   dst->f[1] = 0.0f;
436   dst->f[2] = 0.0f;
437   dst->f[3] = 0.0f;
438}
439
440static void
441micro_str(union tgsi_exec_channel *dst)
442{
443   dst->f[0] = 1.0f;
444   dst->f[1] = 1.0f;
445   dst->f[2] = 1.0f;
446   dst->f[3] = 1.0f;
447}
448
449static void
450micro_trunc(union tgsi_exec_channel *dst,
451            const union tgsi_exec_channel *src)
452{
453   dst->f[0] = (float)(int)src->f[0];
454   dst->f[1] = (float)(int)src->f[1];
455   dst->f[2] = (float)(int)src->f[2];
456   dst->f[3] = (float)(int)src->f[3];
457}
458
459
/* Numeric indices of the X, Y, Z and W channels of a register. */
#define CHAN_X  0
#define CHAN_Y  1
#define CHAN_Z  2
#define CHAN_W  3
464
/*
 * Data type tag: float, signed int or unsigned int.
 * NOTE(review): presumably selects which view (f/i/u) of a
 * tgsi_exec_channel an operation uses; the users are not visible here.
 */
enum tgsi_exec_datatype {
   TGSI_EXEC_DATA_FLOAT,
   TGSI_EXEC_DATA_INT,
   TGSI_EXEC_DATA_UINT
};
470
/*
 * Shorthand locations of various utility registers (_I = Index, _C = Channel).
 * Each name is a local alias for the TGSI_EXEC_TEMP_* constant of the same
 * suffix.
 */
#define TEMP_KILMASK_I     TGSI_EXEC_TEMP_KILMASK_I
#define TEMP_KILMASK_C     TGSI_EXEC_TEMP_KILMASK_C
#define TEMP_OUTPUT_I      TGSI_EXEC_TEMP_OUTPUT_I
#define TEMP_OUTPUT_C      TGSI_EXEC_TEMP_OUTPUT_C
#define TEMP_PRIMITIVE_I   TGSI_EXEC_TEMP_PRIMITIVE_I
#define TEMP_PRIMITIVE_C   TGSI_EXEC_TEMP_PRIMITIVE_C
480
481
/**
 * Recompute the execution mask as the bitwise AND of CondMask, LoopMask,
 * ContMask, Switch.mask and FuncMask.
 *
 * MACH is a pointer to the machine.  It is parenthesized at every use and
 * the whole expansion is parenthesized, so any pointer expression can be
 * passed and the macro can be used wherever an expression is allowed.
 * MACH is evaluated more than once, so it must not have side effects.
 */
#define UPDATE_EXEC_MASK(MACH) \
      ((MACH)->ExecMask = (MACH)->CondMask & (MACH)->LoopMask & \
                          (MACH)->ContMask & (MACH)->Switch.mask & \
                          (MACH)->FuncMask)
485
486
487static const union tgsi_exec_channel ZeroVec =
488   { { 0.0, 0.0, 0.0, 0.0 } };
489
490static const union tgsi_exec_channel OneVec = {
491   {1.0f, 1.0f, 1.0f, 1.0f}
492};
493
494static const union tgsi_exec_channel P128Vec = {
495   {128.0f, 128.0f, 128.0f, 128.0f}
496};
497
498static const union tgsi_exec_channel M128Vec = {
499   {-128.0f, -128.0f, -128.0f, -128.0f}
500};
501
502
503/**
504 * Assert that none of the float values in 'chan' are infinite or NaN.
505 * NaN and Inf may occur normally during program execution and should
506 * not lead to crashes, etc.  But when debugging, it's helpful to catch
507 * them.
508 */
509static INLINE void
510check_inf_or_nan(const union tgsi_exec_channel *chan)
511{
512   assert(!util_is_inf_or_nan((chan)->f[0]));
513   assert(!util_is_inf_or_nan((chan)->f[1]));
514   assert(!util_is_inf_or_nan((chan)->f[2]));
515   assert(!util_is_inf_or_nan((chan)->f[3]));
516}
517
518
#ifdef DEBUG
/** Debug helper: print the four float values of 'chan', labelled with 'msg'. */
static void
print_chan(const char *msg, const union tgsi_exec_channel *chan)
{
   const float v0 = chan->f[0];
   const float v1 = chan->f[1];
   const float v2 = chan->f[2];
   const float v3 = chan->f[3];

   debug_printf("%s = {%f, %f, %f, %f}\n", msg, v0, v1, v2, v3);
}
#endif
527
528
#ifdef DEBUG
/**
 * Debug helper: print temporary register 'index' of the machine, one line
 * per X/Y/Z/W channel with that channel's four float values.
 */
static void
print_temp(const struct tgsi_exec_machine *mach, uint index)
{
   static const char chanNames[] = "XYZW";
   int c;

   debug_printf("Temp[%u] =\n", index);

   for (c = 0; c < 4; c++) {
      const union tgsi_exec_channel *values = &mach->Temps[index].xyzw[c];

      debug_printf("  %c: { %f, %f, %f, %f }\n",
                   chanNames[c],
                   values->f[0], values->f[1], values->f[2], values->f[3]);
   }
}
#endif
546
547
548void
549tgsi_exec_set_constant_buffers(struct tgsi_exec_machine *mach,
550                               unsigned num_bufs,
551                               const void **bufs,
552                               const unsigned *buf_sizes)
553{
554   unsigned i;
555
556   for (i = 0; i < num_bufs; i++) {
557      mach->Consts[i] = bufs[i];
558      mach->ConstsSize[i] = buf_sizes[i];
559   }
560}
561
562
563/**
564 * Check if there's a potential src/dst register data dependency when
565 * using SOA execution.
566 * Example:
567 *   MOV T, T.yxwz;
568 * This would expand into:
569 *   MOV t0, t1;
570 *   MOV t1, t0;
571 *   MOV t2, t3;
572 *   MOV t3, t2;
573 * The second instruction will have the wrong value for t0 if executed as-is.
574 */
575boolean
576tgsi_check_soa_dependencies(const struct tgsi_full_instruction *inst)
577{
578   uint i, chan;
579
580   uint writemask = inst->Dst[0].Register.WriteMask;
581   if (writemask == TGSI_WRITEMASK_X ||
582       writemask == TGSI_WRITEMASK_Y ||
583       writemask == TGSI_WRITEMASK_Z ||
584       writemask == TGSI_WRITEMASK_W ||
585       writemask == TGSI_WRITEMASK_NONE) {
586      /* no chance of data dependency */
587      return FALSE;
588   }
589
590   /* loop over src regs */
591   for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
592      if ((inst->Src[i].Register.File ==
593           inst->Dst[0].Register.File) &&
594          ((inst->Src[i].Register.Index ==
595            inst->Dst[0].Register.Index) ||
596           inst->Src[i].Register.Indirect ||
597           inst->Dst[0].Register.Indirect)) {
598         /* loop over dest channels */
599         uint channelsWritten = 0x0;
600         for (chan = 0; chan < NUM_CHANNELS; chan++) {
601            if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
602               /* check if we're reading a channel that's been written */
603               uint swizzle = tgsi_util_get_full_src_register_swizzle(&inst->Src[i], chan);
604               if (channelsWritten & (1 << swizzle)) {
605                  return TRUE;
606               }
607
608               channelsWritten |= (1 << chan);
609            }
610         }
611      }
612   }
613   return FALSE;
614}
615
616
617/**
618 * Initialize machine state by expanding tokens to full instructions,
619 * allocating temporary storage, setting up constants, etc.
620 * After this, we can call tgsi_exec_machine_run() many times.
621 */
622void
623tgsi_exec_machine_bind_shader(
624   struct tgsi_exec_machine *mach,
625   const struct tgsi_token *tokens,
626   uint numSamplers,
627   struct tgsi_sampler **samplers)
628{
629   uint k;
630   struct tgsi_parse_context parse;
631   struct tgsi_full_instruction *instructions;
632   struct tgsi_full_declaration *declarations;
633   uint maxInstructions = 10, numInstructions = 0;
634   uint maxDeclarations = 10, numDeclarations = 0;
635
636#if 0
637   tgsi_dump(tokens, 0);
638#endif
639
640   util_init_math();
641
642   if (numSamplers) {
643      assert(samplers);
644   }
645
646   mach->Tokens = tokens;
647   mach->Samplers = samplers;
648
649   if (!tokens) {
650      /* unbind and free all */
651      if (mach->Declarations) {
652         FREE( mach->Declarations );
653      }
654      mach->Declarations = NULL;
655      mach->NumDeclarations = 0;
656
657      if (mach->Instructions) {
658         FREE( mach->Instructions );
659      }
660      mach->Instructions = NULL;
661      mach->NumInstructions = 0;
662
663      return;
664   }
665
666   k = tgsi_parse_init (&parse, mach->Tokens);
667   if (k != TGSI_PARSE_OK) {
668      debug_printf( "Problem parsing!\n" );
669      return;
670   }
671
672   mach->Processor = parse.FullHeader.Processor.Processor;
673   mach->ImmLimit = 0;
674
675   if (mach->Processor == TGSI_PROCESSOR_GEOMETRY &&
676       !mach->UsedGeometryShader) {
677      struct tgsi_exec_vector *inputs;
678      struct tgsi_exec_vector *outputs;
679
680      inputs = align_malloc(sizeof(struct tgsi_exec_vector) *
681                            TGSI_MAX_PRIM_VERTICES * PIPE_MAX_ATTRIBS,
682                            16);
683
684      if (!inputs)
685         return;
686
687      outputs = align_malloc(sizeof(struct tgsi_exec_vector) *
688                             TGSI_MAX_TOTAL_VERTICES, 16);
689
690      if (!outputs) {
691         align_free(inputs);
692         return;
693      }
694
695      align_free(mach->Inputs);
696      align_free(mach->Outputs);
697
698      mach->Inputs = inputs;
699      mach->Outputs = outputs;
700      mach->UsedGeometryShader = TRUE;
701   }
702
703   declarations = (struct tgsi_full_declaration *)
704      MALLOC( maxDeclarations * sizeof(struct tgsi_full_declaration) );
705
706   if (!declarations) {
707      return;
708   }
709
710   instructions = (struct tgsi_full_instruction *)
711      MALLOC( maxInstructions * sizeof(struct tgsi_full_instruction) );
712
713   if (!instructions) {
714      FREE( declarations );
715      return;
716   }
717
718   while( !tgsi_parse_end_of_tokens( &parse ) ) {
719      uint i;
720
721      tgsi_parse_token( &parse );
722      switch( parse.FullToken.Token.Type ) {
723      case TGSI_TOKEN_TYPE_DECLARATION:
724         /* save expanded declaration */
725         if (numDeclarations == maxDeclarations) {
726            declarations = REALLOC(declarations,
727                                   maxDeclarations
728                                   * sizeof(struct tgsi_full_declaration),
729                                   (maxDeclarations + 10)
730                                   * sizeof(struct tgsi_full_declaration));
731            maxDeclarations += 10;
732         }
733         if (parse.FullToken.FullDeclaration.Declaration.File == TGSI_FILE_OUTPUT) {
734            unsigned reg;
735            for (reg = parse.FullToken.FullDeclaration.Range.First;
736                 reg <= parse.FullToken.FullDeclaration.Range.Last;
737                 ++reg) {
738               ++mach->NumOutputs;
739            }
740         }
741         if (parse.FullToken.FullDeclaration.Declaration.File ==
742             TGSI_FILE_IMMEDIATE_ARRAY) {
743            unsigned reg;
744            struct tgsi_full_declaration *decl =
745               &parse.FullToken.FullDeclaration;
746            debug_assert(decl->Range.Last < TGSI_EXEC_NUM_IMMEDIATES);
747            for (reg = decl->Range.First; reg <= decl->Range.Last; ++reg) {
748               for( i = 0; i < 4; i++ ) {
749                  int idx = reg * 4 + i;
750                  mach->ImmArray[reg][i] = decl->ImmediateData.u[idx].Float;
751               }
752            }
753         }
754         memcpy(declarations + numDeclarations,
755                &parse.FullToken.FullDeclaration,
756                sizeof(declarations[0]));
757         numDeclarations++;
758         break;
759
760      case TGSI_TOKEN_TYPE_IMMEDIATE:
761         {
762            uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
763            assert( size <= 4 );
764            assert( mach->ImmLimit + 1 <= TGSI_EXEC_NUM_IMMEDIATES );
765
766            for( i = 0; i < size; i++ ) {
767               mach->Imms[mach->ImmLimit][i] =
768		  parse.FullToken.FullImmediate.u[i].Float;
769            }
770            mach->ImmLimit += 1;
771         }
772         break;
773
774      case TGSI_TOKEN_TYPE_INSTRUCTION:
775
776         /* save expanded instruction */
777         if (numInstructions == maxInstructions) {
778            instructions = REALLOC(instructions,
779                                   maxInstructions
780                                   * sizeof(struct tgsi_full_instruction),
781                                   (maxInstructions + 10)
782                                   * sizeof(struct tgsi_full_instruction));
783            maxInstructions += 10;
784         }
785
786         memcpy(instructions + numInstructions,
787                &parse.FullToken.FullInstruction,
788                sizeof(instructions[0]));
789
790         numInstructions++;
791         break;
792
793      case TGSI_TOKEN_TYPE_PROPERTY:
794         break;
795
796      default:
797         assert( 0 );
798      }
799   }
800   tgsi_parse_free (&parse);
801
802   if (mach->Declarations) {
803      FREE( mach->Declarations );
804   }
805   mach->Declarations = declarations;
806   mach->NumDeclarations = numDeclarations;
807
808   if (mach->Instructions) {
809      FREE( mach->Instructions );
810   }
811   mach->Instructions = instructions;
812   mach->NumInstructions = numInstructions;
813}
814
815
816struct tgsi_exec_machine *
817tgsi_exec_machine_create( void )
818{
819   struct tgsi_exec_machine *mach;
820   uint i;
821
822   mach = align_malloc( sizeof *mach, 16 );
823   if (!mach)
824      goto fail;
825
826   memset(mach, 0, sizeof(*mach));
827
828   mach->Addrs = &mach->Temps[TGSI_EXEC_TEMP_ADDR];
829   mach->MaxGeometryShaderOutputs = TGSI_MAX_TOTAL_VERTICES;
830   mach->Predicates = &mach->Temps[TGSI_EXEC_TEMP_P0];
831
832   mach->Inputs = align_malloc(sizeof(struct tgsi_exec_vector) * PIPE_MAX_ATTRIBS, 16);
833   mach->Outputs = align_malloc(sizeof(struct tgsi_exec_vector) * PIPE_MAX_ATTRIBS, 16);
834   if (!mach->Inputs || !mach->Outputs)
835      goto fail;
836
837   /* Setup constants needed by the SSE2 executor. */
838   for( i = 0; i < 4; i++ ) {
839      mach->Temps[TGSI_EXEC_TEMP_00000000_I].xyzw[TGSI_EXEC_TEMP_00000000_C].u[i] = 0x00000000;
840      mach->Temps[TGSI_EXEC_TEMP_7FFFFFFF_I].xyzw[TGSI_EXEC_TEMP_7FFFFFFF_C].u[i] = 0x7FFFFFFF;
841      mach->Temps[TGSI_EXEC_TEMP_80000000_I].xyzw[TGSI_EXEC_TEMP_80000000_C].u[i] = 0x80000000;
842      mach->Temps[TGSI_EXEC_TEMP_FFFFFFFF_I].xyzw[TGSI_EXEC_TEMP_FFFFFFFF_C].u[i] = 0xFFFFFFFF;    /* not used */
843      mach->Temps[TGSI_EXEC_TEMP_ONE_I].xyzw[TGSI_EXEC_TEMP_ONE_C].f[i] = 1.0f;
844      mach->Temps[TGSI_EXEC_TEMP_TWO_I].xyzw[TGSI_EXEC_TEMP_TWO_C].f[i] = 2.0f;    /* not used */
845      mach->Temps[TGSI_EXEC_TEMP_128_I].xyzw[TGSI_EXEC_TEMP_128_C].f[i] = 128.0f;
846      mach->Temps[TGSI_EXEC_TEMP_MINUS_128_I].xyzw[TGSI_EXEC_TEMP_MINUS_128_C].f[i] = -128.0f;
847      mach->Temps[TGSI_EXEC_TEMP_THREE_I].xyzw[TGSI_EXEC_TEMP_THREE_C].f[i] = 3.0f;
848      mach->Temps[TGSI_EXEC_TEMP_HALF_I].xyzw[TGSI_EXEC_TEMP_HALF_C].f[i] = 0.5f;
849   }
850
851#ifdef DEBUG
852   /* silence warnings */
853   (void) print_chan;
854   (void) print_temp;
855#endif
856
857   return mach;
858
859fail:
860   if (mach) {
861      align_free(mach->Inputs);
862      align_free(mach->Outputs);
863      align_free(mach);
864   }
865   return NULL;
866}
867
868
869void
870tgsi_exec_machine_destroy(struct tgsi_exec_machine *mach)
871{
872   if (mach) {
873      if (mach->Instructions)
874         FREE(mach->Instructions);
875      if (mach->Declarations)
876         FREE(mach->Declarations);
877
878      align_free(mach->Inputs);
879      align_free(mach->Outputs);
880
881      align_free(mach);
882   }
883}
884
885static void
886micro_add(union tgsi_exec_channel *dst,
887          const union tgsi_exec_channel *src0,
888          const union tgsi_exec_channel *src1)
889{
890   dst->f[0] = src0->f[0] + src1->f[0];
891   dst->f[1] = src0->f[1] + src1->f[1];
892   dst->f[2] = src0->f[2] + src1->f[2];
893   dst->f[3] = src0->f[3] + src1->f[3];
894}
895
896static void
897micro_div(
898   union tgsi_exec_channel *dst,
899   const union tgsi_exec_channel *src0,
900   const union tgsi_exec_channel *src1 )
901{
902   if (src1->f[0] != 0) {
903      dst->f[0] = src0->f[0] / src1->f[0];
904   }
905   if (src1->f[1] != 0) {
906      dst->f[1] = src0->f[1] / src1->f[1];
907   }
908   if (src1->f[2] != 0) {
909      dst->f[2] = src0->f[2] / src1->f[2];
910   }
911   if (src1->f[3] != 0) {
912      dst->f[3] = src0->f[3] / src1->f[3];
913   }
914}
915
916static void
917micro_rcc(union tgsi_exec_channel *dst,
918          const union tgsi_exec_channel *src)
919{
920   uint i;
921
922   for (i = 0; i < 4; i++) {
923      float recip = 1.0f / src->f[i];
924
925      if (recip > 0.0f) {
926         if (recip > 1.884467e+019f) {
927            dst->f[i] = 1.884467e+019f;
928         }
929         else if (recip < 5.42101e-020f) {
930            dst->f[i] = 5.42101e-020f;
931         }
932         else {
933            dst->f[i] = recip;
934         }
935      }
936      else {
937         if (recip < -1.884467e+019f) {
938            dst->f[i] = -1.884467e+019f;
939         }
940         else if (recip > -5.42101e-020f) {
941            dst->f[i] = -5.42101e-020f;
942         }
943         else {
944            dst->f[i] = recip;
945         }
946      }
947   }
948}
949
950static void
951micro_lt(
952   union tgsi_exec_channel *dst,
953   const union tgsi_exec_channel *src0,
954   const union tgsi_exec_channel *src1,
955   const union tgsi_exec_channel *src2,
956   const union tgsi_exec_channel *src3 )
957{
958   dst->f[0] = src0->f[0] < src1->f[0] ? src2->f[0] : src3->f[0];
959   dst->f[1] = src0->f[1] < src1->f[1] ? src2->f[1] : src3->f[1];
960   dst->f[2] = src0->f[2] < src1->f[2] ? src2->f[2] : src3->f[2];
961   dst->f[3] = src0->f[3] < src1->f[3] ? src2->f[3] : src3->f[3];
962}
963
964static void
965micro_max(union tgsi_exec_channel *dst,
966          const union tgsi_exec_channel *src0,
967          const union tgsi_exec_channel *src1)
968{
969   dst->f[0] = src0->f[0] > src1->f[0] ? src0->f[0] : src1->f[0];
970   dst->f[1] = src0->f[1] > src1->f[1] ? src0->f[1] : src1->f[1];
971   dst->f[2] = src0->f[2] > src1->f[2] ? src0->f[2] : src1->f[2];
972   dst->f[3] = src0->f[3] > src1->f[3] ? src0->f[3] : src1->f[3];
973}
974
975static void
976micro_min(union tgsi_exec_channel *dst,
977          const union tgsi_exec_channel *src0,
978          const union tgsi_exec_channel *src1)
979{
980   dst->f[0] = src0->f[0] < src1->f[0] ? src0->f[0] : src1->f[0];
981   dst->f[1] = src0->f[1] < src1->f[1] ? src0->f[1] : src1->f[1];
982   dst->f[2] = src0->f[2] < src1->f[2] ? src0->f[2] : src1->f[2];
983   dst->f[3] = src0->f[3] < src1->f[3] ? src0->f[3] : src1->f[3];
984}
985
986static void
987micro_mul(union tgsi_exec_channel *dst,
988          const union tgsi_exec_channel *src0,
989          const union tgsi_exec_channel *src1)
990{
991   dst->f[0] = src0->f[0] * src1->f[0];
992   dst->f[1] = src0->f[1] * src1->f[1];
993   dst->f[2] = src0->f[2] * src1->f[2];
994   dst->f[3] = src0->f[3] * src1->f[3];
995}
996
997static void
998micro_neg(
999   union tgsi_exec_channel *dst,
1000   const union tgsi_exec_channel *src )
1001{
1002   dst->f[0] = -src->f[0];
1003   dst->f[1] = -src->f[1];
1004   dst->f[2] = -src->f[2];
1005   dst->f[3] = -src->f[3];
1006}
1007
1008static void
1009micro_pow(
1010   union tgsi_exec_channel *dst,
1011   const union tgsi_exec_channel *src0,
1012   const union tgsi_exec_channel *src1 )
1013{
1014#if FAST_MATH
1015   dst->f[0] = util_fast_pow( src0->f[0], src1->f[0] );
1016   dst->f[1] = util_fast_pow( src0->f[1], src1->f[1] );
1017   dst->f[2] = util_fast_pow( src0->f[2], src1->f[2] );
1018   dst->f[3] = util_fast_pow( src0->f[3], src1->f[3] );
1019#else
1020   dst->f[0] = powf( src0->f[0], src1->f[0] );
1021   dst->f[1] = powf( src0->f[1], src1->f[1] );
1022   dst->f[2] = powf( src0->f[2], src1->f[2] );
1023   dst->f[3] = powf( src0->f[3], src1->f[3] );
1024#endif
1025}
1026
1027static void
1028micro_sub(union tgsi_exec_channel *dst,
1029          const union tgsi_exec_channel *src0,
1030          const union tgsi_exec_channel *src1)
1031{
1032   dst->f[0] = src0->f[0] - src1->f[0];
1033   dst->f[1] = src0->f[1] - src1->f[1];
1034   dst->f[2] = src0->f[2] - src1->f[2];
1035   dst->f[3] = src0->f[3] - src1->f[3];
1036}
1037
1038static void
1039fetch_src_file_channel(const struct tgsi_exec_machine *mach,
1040                       const uint file,
1041                       const uint swizzle,
1042                       const union tgsi_exec_channel *index,
1043                       const union tgsi_exec_channel *index2D,
1044                       union tgsi_exec_channel *chan)
1045{
1046   uint i;
1047
1048   assert(swizzle < 4);
1049
1050   switch (file) {
1051   case TGSI_FILE_CONSTANT:
1052      for (i = 0; i < QUAD_SIZE; i++) {
1053         assert(index2D->i[i] >= 0 && index2D->i[i] < PIPE_MAX_CONSTANT_BUFFERS);
1054         assert(mach->Consts[index2D->i[i]]);
1055
1056         if (index->i[i] < 0) {
1057            chan->u[i] = 0;
1058         } else {
1059            /* NOTE: copying the const value as a uint instead of float */
1060            const uint constbuf = index2D->i[i];
1061            const uint *buf = (const uint *)mach->Consts[constbuf];
1062            const int pos = index->i[i] * 4 + swizzle;
1063            /* const buffer bounds check */
1064            if (pos < 0 || pos >= mach->ConstsSize[constbuf]) {
1065               if (0) {
1066                  /* Debug: print warning */
1067                  static int count = 0;
1068                  if (count++ < 100)
1069                     debug_printf("TGSI Exec: const buffer index %d"
1070                                  " out of bounds\n", pos);
1071               }
1072               chan->u[i] = 0;
1073            }
1074            else
1075               chan->u[i] = buf[pos];
1076         }
1077      }
1078      break;
1079
1080   case TGSI_FILE_INPUT:
1081      for (i = 0; i < QUAD_SIZE; i++) {
1082         /*
1083         if (TGSI_PROCESSOR_GEOMETRY == mach->Processor) {
1084            debug_printf("Fetching Input[%d] (2d=%d, 1d=%d)\n",
1085                         index2D->i[i] * TGSI_EXEC_MAX_INPUT_ATTRIBS + index->i[i],
1086                         index2D->i[i], index->i[i]);
1087                         }*/
1088         int pos = index2D->i[i] * TGSI_EXEC_MAX_INPUT_ATTRIBS + index->i[i];
1089         assert(pos >= 0);
1090         assert(pos < TGSI_MAX_PRIM_VERTICES * PIPE_MAX_ATTRIBS);
1091         chan->u[i] = mach->Inputs[pos].xyzw[swizzle].u[i];
1092      }
1093      break;
1094
1095   case TGSI_FILE_SYSTEM_VALUE:
1096      /* XXX no swizzling at this point.  Will be needed if we put
1097       * gl_FragCoord, for example, in a sys value register.
1098       */
1099      for (i = 0; i < QUAD_SIZE; i++) {
1100         chan->f[i] = mach->SystemValue[index->i[i]][0];
1101      }
1102      break;
1103
1104   case TGSI_FILE_TEMPORARY:
1105      for (i = 0; i < QUAD_SIZE; i++) {
1106         assert(index->i[i] < TGSI_EXEC_NUM_TEMPS);
1107         assert(index2D->i[i] == 0);
1108
1109         chan->u[i] = mach->Temps[index->i[i]].xyzw[swizzle].u[i];
1110      }
1111      break;
1112
1113   case TGSI_FILE_TEMPORARY_ARRAY:
1114      for (i = 0; i < QUAD_SIZE; i++) {
1115         assert(index->i[i] < TGSI_EXEC_NUM_TEMPS);
1116         assert(index2D->i[i] < TGSI_EXEC_NUM_TEMP_ARRAYS);
1117
1118         chan->u[i] =
1119            mach->TempArray[index2D->i[i]][index->i[i]].xyzw[swizzle].u[i];
1120      }
1121      break;
1122
1123   case TGSI_FILE_IMMEDIATE:
1124      for (i = 0; i < QUAD_SIZE; i++) {
1125         assert(index->i[i] >= 0 && index->i[i] < (int)mach->ImmLimit);
1126         assert(index2D->i[i] == 0);
1127
1128         chan->f[i] = mach->Imms[index->i[i]][swizzle];
1129      }
1130      break;
1131
1132   case TGSI_FILE_IMMEDIATE_ARRAY:
1133      for (i = 0; i < QUAD_SIZE; i++) {
1134         assert(index2D->i[i] == 0);
1135
1136         chan->f[i] = mach->ImmArray[index->i[i]][swizzle];
1137      }
1138      break;
1139
1140   case TGSI_FILE_ADDRESS:
1141      for (i = 0; i < QUAD_SIZE; i++) {
1142         assert(index->i[i] >= 0);
1143         assert(index2D->i[i] == 0);
1144
1145         chan->u[i] = mach->Addrs[index->i[i]].xyzw[swizzle].u[i];
1146      }
1147      break;
1148
1149   case TGSI_FILE_PREDICATE:
1150      for (i = 0; i < QUAD_SIZE; i++) {
1151         assert(index->i[i] >= 0 && index->i[i] < TGSI_EXEC_NUM_PREDS);
1152         assert(index2D->i[i] == 0);
1153
1154         chan->u[i] = mach->Predicates[0].xyzw[swizzle].u[i];
1155      }
1156      break;
1157
1158   case TGSI_FILE_OUTPUT:
1159      /* vertex/fragment output vars can be read too */
1160      for (i = 0; i < QUAD_SIZE; i++) {
1161         assert(index->i[i] >= 0);
1162         assert(index2D->i[i] == 0);
1163
1164         chan->u[i] = mach->Outputs[index->i[i]].xyzw[swizzle].u[i];
1165      }
1166      break;
1167
1168   default:
1169      assert(0);
1170      for (i = 0; i < QUAD_SIZE; i++) {
1171         chan->u[i] = 0;
1172      }
1173   }
1174}
1175
1176static void
1177fetch_source(const struct tgsi_exec_machine *mach,
1178             union tgsi_exec_channel *chan,
1179             const struct tgsi_full_src_register *reg,
1180             const uint chan_index,
1181             enum tgsi_exec_datatype src_datatype)
1182{
1183   union tgsi_exec_channel index;
1184   union tgsi_exec_channel index2D;
1185   uint swizzle;
1186
1187   /* We start with a direct index into a register file.
1188    *
1189    *    file[1],
1190    *    where:
1191    *       file = Register.File
1192    *       [1] = Register.Index
1193    */
1194   index.i[0] =
1195   index.i[1] =
1196   index.i[2] =
1197   index.i[3] = reg->Register.Index;
1198
1199   /* There is an extra source register that indirectly subscripts
1200    * a register file. The direct index now becomes an offset
1201    * that is being added to the indirect register.
1202    *
1203    *    file[ind[2].x+1],
1204    *    where:
1205    *       ind = Indirect.File
1206    *       [2] = Indirect.Index
1207    *       .x = Indirect.SwizzleX
1208    */
1209   if (reg->Register.Indirect) {
1210      union tgsi_exec_channel index2;
1211      union tgsi_exec_channel indir_index;
1212      const uint execmask = mach->ExecMask;
1213      uint i;
1214
1215      /* which address register (always zero now) */
1216      index2.i[0] =
1217      index2.i[1] =
1218      index2.i[2] =
1219      index2.i[3] = reg->Indirect.Index;
1220      assert(reg->Indirect.File == TGSI_FILE_ADDRESS);
1221      /* get current value of address register[swizzle] */
1222      swizzle = tgsi_util_get_src_register_swizzle( &reg->Indirect, CHAN_X );
1223      fetch_src_file_channel(mach,
1224                             reg->Indirect.File,
1225                             swizzle,
1226                             &index2,
1227                             &ZeroVec,
1228                             &indir_index);
1229
1230      /* add value of address register to the offset */
1231      index.i[0] += indir_index.i[0];
1232      index.i[1] += indir_index.i[1];
1233      index.i[2] += indir_index.i[2];
1234      index.i[3] += indir_index.i[3];
1235
1236      /* for disabled execution channels, zero-out the index to
1237       * avoid using a potential garbage value.
1238       */
1239      for (i = 0; i < QUAD_SIZE; i++) {
1240         if ((execmask & (1 << i)) == 0)
1241            index.i[i] = 0;
1242      }
1243   }
1244
1245   /* There is an extra source register that is a second
1246    * subscript to a register file. Effectively it means that
1247    * the register file is actually a 2D array of registers.
1248    *
1249    *    file[3][1],
1250    *    where:
1251    *       [3] = Dimension.Index
1252    */
1253   if (reg->Register.Dimension) {
1254      index2D.i[0] =
1255      index2D.i[1] =
1256      index2D.i[2] =
1257      index2D.i[3] = reg->Dimension.Index;
1258
1259      /* Again, the second subscript index can be addressed indirectly
1260       * identically to the first one.
1261       * Nothing stops us from indirectly addressing the indirect register,
1262       * but there is no need for that, so we won't exercise it.
1263       *
1264       *    file[ind[4].y+3][1],
1265       *    where:
1266       *       ind = DimIndirect.File
1267       *       [4] = DimIndirect.Index
1268       *       .y = DimIndirect.SwizzleX
1269       */
1270      if (reg->Dimension.Indirect) {
1271         union tgsi_exec_channel index2;
1272         union tgsi_exec_channel indir_index;
1273         const uint execmask = mach->ExecMask;
1274         uint i;
1275
1276         index2.i[0] =
1277         index2.i[1] =
1278         index2.i[2] =
1279         index2.i[3] = reg->DimIndirect.Index;
1280
1281         swizzle = tgsi_util_get_src_register_swizzle( &reg->DimIndirect, CHAN_X );
1282         fetch_src_file_channel(mach,
1283                                reg->DimIndirect.File,
1284                                swizzle,
1285                                &index2,
1286                                &ZeroVec,
1287                                &indir_index);
1288
1289         index2D.i[0] += indir_index.i[0];
1290         index2D.i[1] += indir_index.i[1];
1291         index2D.i[2] += indir_index.i[2];
1292         index2D.i[3] += indir_index.i[3];
1293
1294         /* for disabled execution channels, zero-out the index to
1295          * avoid using a potential garbage value.
1296          */
1297         for (i = 0; i < QUAD_SIZE; i++) {
1298            if ((execmask & (1 << i)) == 0) {
1299               index2D.i[i] = 0;
1300            }
1301         }
1302      }
1303
1304      /* If by any chance there was a need for a 3D array of register
1305       * files, we would have to check whether Dimension is followed
1306       * by a dimension register and continue the saga.
1307       */
1308   } else {
1309      index2D.i[0] =
1310      index2D.i[1] =
1311      index2D.i[2] =
1312      index2D.i[3] = 0;
1313   }
1314
1315   swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index );
1316   fetch_src_file_channel(mach,
1317                          reg->Register.File,
1318                          swizzle,
1319                          &index,
1320                          &index2D,
1321                          chan);
1322
1323   if (reg->Register.Absolute) {
1324      if (src_datatype == TGSI_EXEC_DATA_FLOAT) {
1325         micro_abs(chan, chan);
1326      } else {
1327         micro_iabs(chan, chan);
1328      }
1329   }
1330
1331   if (reg->Register.Negate) {
1332      if (src_datatype == TGSI_EXEC_DATA_FLOAT) {
1333         micro_neg(chan, chan);
1334      } else {
1335         micro_ineg(chan, chan);
1336      }
1337   }
1338}
1339
1340static void
1341store_dest(struct tgsi_exec_machine *mach,
1342           const union tgsi_exec_channel *chan,
1343           const struct tgsi_full_dst_register *reg,
1344           const struct tgsi_full_instruction *inst,
1345           uint chan_index,
1346           enum tgsi_exec_datatype dst_datatype)
1347{
1348   uint i;
1349   union tgsi_exec_channel null;
1350   union tgsi_exec_channel *dst;
1351   union tgsi_exec_channel index2D;
1352   uint execmask = mach->ExecMask;
1353   int offset = 0;  /* indirection offset */
1354   int index;
1355
1356   /* for debugging */
1357   if (0 && dst_datatype == TGSI_EXEC_DATA_FLOAT) {
1358      check_inf_or_nan(chan);
1359   }
1360
1361   /* There is an extra source register that indirectly subscripts
1362    * a register file. The direct index now becomes an offset
1363    * that is being added to the indirect register.
1364    *
1365    *    file[ind[2].x+1],
1366    *    where:
1367    *       ind = Indirect.File
1368    *       [2] = Indirect.Index
1369    *       .x = Indirect.SwizzleX
1370    */
1371   if (reg->Register.Indirect) {
1372      union tgsi_exec_channel index;
1373      union tgsi_exec_channel indir_index;
1374      uint swizzle;
1375
1376      /* which address register (always zero for now) */
1377      index.i[0] =
1378      index.i[1] =
1379      index.i[2] =
1380      index.i[3] = reg->Indirect.Index;
1381
1382      /* get current value of address register[swizzle] */
1383      swizzle = tgsi_util_get_src_register_swizzle( &reg->Indirect, CHAN_X );
1384
1385      /* fetch values from the address/indirection register */
1386      fetch_src_file_channel(mach,
1387                             reg->Indirect.File,
1388                             swizzle,
1389                             &index,
1390                             &ZeroVec,
1391                             &indir_index);
1392
1393      /* save indirection offset */
1394      offset = indir_index.i[0];
1395   }
1396
1397   /* There is an extra source register that is a second
1398    * subscript to a register file. Effectively it means that
1399    * the register file is actually a 2D array of registers.
1400    *
1401    *    file[3][1],
1402    *    where:
1403    *       [3] = Dimension.Index
1404    */
1405   if (reg->Register.Dimension) {
1406      index2D.i[0] =
1407      index2D.i[1] =
1408      index2D.i[2] =
1409      index2D.i[3] = reg->Dimension.Index;
1410
1411      /* Again, the second subscript index can be addressed indirectly
1412       * identically to the first one.
1413       * Nothing stops us from indirectly addressing the indirect register,
1414       * but there is no need for that, so we won't exercise it.
1415       *
1416       *    file[ind[4].y+3][1],
1417       *    where:
1418       *       ind = DimIndirect.File
1419       *       [4] = DimIndirect.Index
1420       *       .y = DimIndirect.SwizzleX
1421       */
1422      if (reg->Dimension.Indirect) {
1423         union tgsi_exec_channel index2;
1424         union tgsi_exec_channel indir_index;
1425         const uint execmask = mach->ExecMask;
1426         unsigned swizzle;
1427         uint i;
1428
1429         index2.i[0] =
1430         index2.i[1] =
1431         index2.i[2] =
1432         index2.i[3] = reg->DimIndirect.Index;
1433
1434         swizzle = tgsi_util_get_src_register_swizzle( &reg->DimIndirect, CHAN_X );
1435         fetch_src_file_channel(mach,
1436                                reg->DimIndirect.File,
1437                                swizzle,
1438                                &index2,
1439                                &ZeroVec,
1440                                &indir_index);
1441
1442         index2D.i[0] += indir_index.i[0];
1443         index2D.i[1] += indir_index.i[1];
1444         index2D.i[2] += indir_index.i[2];
1445         index2D.i[3] += indir_index.i[3];
1446
1447         /* for disabled execution channels, zero-out the index to
1448          * avoid using a potential garbage value.
1449          */
1450         for (i = 0; i < QUAD_SIZE; i++) {
1451            if ((execmask & (1 << i)) == 0) {
1452               index2D.i[i] = 0;
1453            }
1454         }
1455      }
1456
1457      /* If by any chance there was a need for a 3D array of register
1458       * files, we would have to check whether Dimension is followed
1459       * by a dimension register and continue the saga.
1460       */
1461   } else {
1462      index2D.i[0] =
1463      index2D.i[1] =
1464      index2D.i[2] =
1465      index2D.i[3] = 0;
1466   }
1467
1468   switch (reg->Register.File) {
1469   case TGSI_FILE_NULL:
1470      dst = &null;
1471      break;
1472
1473   case TGSI_FILE_OUTPUT:
1474      index = mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0]
1475         + reg->Register.Index;
1476      dst = &mach->Outputs[offset + index].xyzw[chan_index];
1477#if 0
1478      if (TGSI_PROCESSOR_GEOMETRY == mach->Processor) {
1479         fprintf(stderr, "STORING OUT[%d] mask(%d), = (", offset + index, execmask);
1480         for (i = 0; i < QUAD_SIZE; i++)
1481            if (execmask & (1 << i))
1482               fprintf(stderr, "%f, ", chan->f[i]);
1483         fprintf(stderr, ")\n");
1484      }
1485#endif
1486      break;
1487
1488   case TGSI_FILE_TEMPORARY:
1489      index = reg->Register.Index;
1490      assert( index < TGSI_EXEC_NUM_TEMPS );
1491      dst = &mach->Temps[offset + index].xyzw[chan_index];
1492      break;
1493
1494   case TGSI_FILE_TEMPORARY_ARRAY:
1495      index = reg->Register.Index;
1496      assert( index < TGSI_EXEC_NUM_TEMPS );
1497      assert( index2D.i[0] < TGSI_EXEC_NUM_TEMP_ARRAYS );
1498      /* XXX we use index2D.i[0] here but somehow we might
1499       * end up with someone trying to store indirectly in
1500       * different buffers */
1501      dst = &mach->TempArray[index2D.i[0]][offset + index].xyzw[chan_index];
1502      break;
1503
1504   case TGSI_FILE_ADDRESS:
1505      index = reg->Register.Index;
1506      dst = &mach->Addrs[index].xyzw[chan_index];
1507      break;
1508
1509   case TGSI_FILE_PREDICATE:
1510      index = reg->Register.Index;
1511      assert(index < TGSI_EXEC_NUM_PREDS);
1512      dst = &mach->Predicates[index].xyzw[chan_index];
1513      break;
1514
1515   default:
1516      assert( 0 );
1517      return;
1518   }
1519
1520   if (inst->Instruction.Predicate) {
1521      uint swizzle;
1522      union tgsi_exec_channel *pred;
1523
1524      switch (chan_index) {
1525      case CHAN_X:
1526         swizzle = inst->Predicate.SwizzleX;
1527         break;
1528      case CHAN_Y:
1529         swizzle = inst->Predicate.SwizzleY;
1530         break;
1531      case CHAN_Z:
1532         swizzle = inst->Predicate.SwizzleZ;
1533         break;
1534      case CHAN_W:
1535         swizzle = inst->Predicate.SwizzleW;
1536         break;
1537      default:
1538         assert(0);
1539         return;
1540      }
1541
1542      assert(inst->Predicate.Index == 0);
1543
1544      pred = &mach->Predicates[inst->Predicate.Index].xyzw[swizzle];
1545
1546      if (inst->Predicate.Negate) {
1547         for (i = 0; i < QUAD_SIZE; i++) {
1548            if (pred->u[i]) {
1549               execmask &= ~(1 << i);
1550            }
1551         }
1552      } else {
1553         for (i = 0; i < QUAD_SIZE; i++) {
1554            if (!pred->u[i]) {
1555               execmask &= ~(1 << i);
1556            }
1557         }
1558      }
1559   }
1560
1561   switch (inst->Instruction.Saturate) {
1562   case TGSI_SAT_NONE:
1563      for (i = 0; i < QUAD_SIZE; i++)
1564         if (execmask & (1 << i))
1565            dst->i[i] = chan->i[i];
1566      break;
1567
1568   case TGSI_SAT_ZERO_ONE:
1569      for (i = 0; i < QUAD_SIZE; i++)
1570         if (execmask & (1 << i)) {
1571            if (chan->f[i] < 0.0f)
1572               dst->f[i] = 0.0f;
1573            else if (chan->f[i] > 1.0f)
1574               dst->f[i] = 1.0f;
1575            else
1576               dst->i[i] = chan->i[i];
1577         }
1578      break;
1579
1580   case TGSI_SAT_MINUS_PLUS_ONE:
1581      for (i = 0; i < QUAD_SIZE; i++)
1582         if (execmask & (1 << i)) {
1583            if (chan->f[i] < -1.0f)
1584               dst->f[i] = -1.0f;
1585            else if (chan->f[i] > 1.0f)
1586               dst->f[i] = 1.0f;
1587            else
1588               dst->i[i] = chan->i[i];
1589         }
1590      break;
1591
1592   default:
1593      assert( 0 );
1594   }
1595}
1596
/* Fetch channel CHAN of source operand INDEX of the current instruction
 * into VAL, applying the operand's abs/negate modifiers as floats.
 * Expands to a fetch_source() call and relies on `mach` and `inst`
 * being in scope where the macro is used.
 */
#define FETCH(VAL,INDEX,CHAN)\
    fetch_source(mach, VAL, &inst->Src[INDEX], CHAN, TGSI_EXEC_DATA_FLOAT)

/* Same as FETCH, but the abs/negate modifiers are applied as integers. */
#define IFETCH(VAL,INDEX,CHAN)\
    fetch_source(mach, VAL, &inst->Src[INDEX], CHAN, TGSI_EXEC_DATA_INT)
1602
1603
1604/**
1605 * Execute ARB-style KIL which is predicated by a src register.
1606 * Kill fragment if any of the four values is less than zero.
1607 */
1608static void
1609exec_kil(struct tgsi_exec_machine *mach,
1610         const struct tgsi_full_instruction *inst)
1611{
1612   uint uniquemask;
1613   uint chan_index;
1614   uint kilmask = 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
1615   union tgsi_exec_channel r[1];
1616
1617   /* This mask stores component bits that were already tested. */
1618   uniquemask = 0;
1619
1620   for (chan_index = 0; chan_index < 4; chan_index++)
1621   {
1622      uint swizzle;
1623      uint i;
1624
1625      /* unswizzle channel */
1626      swizzle = tgsi_util_get_full_src_register_swizzle (
1627                        &inst->Src[0],
1628                        chan_index);
1629
1630      /* check if the component has not been already tested */
1631      if (uniquemask & (1 << swizzle))
1632         continue;
1633      uniquemask |= 1 << swizzle;
1634
1635      FETCH(&r[0], 0, chan_index);
1636      for (i = 0; i < 4; i++)
1637         if (r[0].f[i] < 0.0f)
1638            kilmask |= 1 << i;
1639   }
1640
1641   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
1642}
1643
1644/**
1645 * Execute NVIDIA-style KIL which is predicated by a condition code.
1646 * Kill fragment if the condition code is TRUE.
1647 */
1648static void
1649exec_kilp(struct tgsi_exec_machine *mach,
1650          const struct tgsi_full_instruction *inst)
1651{
1652   uint kilmask; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
1653
1654   /* "unconditional" kil */
1655   kilmask = mach->ExecMask;
1656   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
1657}
1658
1659static void
1660emit_vertex(struct tgsi_exec_machine *mach)
1661{
1662   /* FIXME: check for exec mask correctly
1663   unsigned i;
1664   for (i = 0; i < QUAD_SIZE; ++i) {
1665         if ((mach->ExecMask & (1 << i)))
1666   */
1667   if (mach->ExecMask) {
1668      mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] += mach->NumOutputs;
1669      mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]]++;
1670   }
1671}
1672
1673static void
1674emit_primitive(struct tgsi_exec_machine *mach)
1675{
1676   unsigned *prim_count = &mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0];
1677   /* FIXME: check for exec mask correctly
1678   unsigned i;
1679   for (i = 0; i < QUAD_SIZE; ++i) {
1680         if ((mach->ExecMask & (1 << i)))
1681   */
1682   if (mach->ExecMask) {
1683      ++(*prim_count);
1684      debug_assert((*prim_count * mach->NumOutputs) < mach->MaxGeometryShaderOutputs);
1685      mach->Primitives[*prim_count] = 0;
1686   }
1687}
1688
1689static void
1690conditional_emit_primitive(struct tgsi_exec_machine *mach)
1691{
1692   if (TGSI_PROCESSOR_GEOMETRY == mach->Processor) {
1693      int emitted_verts =
1694         mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]];
1695      if (emitted_verts) {
1696         emit_primitive(mach);
1697      }
1698   }
1699}
1700
1701
1702/*
1703 * Fetch four texture samples using STR texture coordinates.
1704 */
1705static void
1706fetch_texel( struct tgsi_sampler *sampler,
1707             const union tgsi_exec_channel *s,
1708             const union tgsi_exec_channel *t,
1709             const union tgsi_exec_channel *p,
1710             const union tgsi_exec_channel *c0,
1711             enum tgsi_sampler_control control,
1712             union tgsi_exec_channel *r,
1713             union tgsi_exec_channel *g,
1714             union tgsi_exec_channel *b,
1715             union tgsi_exec_channel *a )
1716{
1717   uint j;
1718   float rgba[NUM_CHANNELS][QUAD_SIZE];
1719
1720   sampler->get_samples(sampler, s->f, t->f, p->f, c0->f, control, rgba);
1721
1722   for (j = 0; j < 4; j++) {
1723      r->f[j] = rgba[0][j];
1724      g->f[j] = rgba[1][j];
1725      b->f[j] = rgba[2][j];
1726      a->f[j] = rgba[3][j];
1727   }
1728}
1729
1730
/* Values for the 'modifier' argument of exec_tex(); they select how the
 * W component of the coordinate operand is interpreted.
 */
#define TEX_MODIFIER_NONE           0  /* W is not fetched up front */
#define TEX_MODIFIER_PROJECTED      1  /* coordinates are divided by W */
#define TEX_MODIFIER_LOD_BIAS       2  /* W is passed to the sampler as LOD bias */
#define TEX_MODIFIER_EXPLICIT_LOD   3  /* W is passed to the sampler as explicit LOD */
1735
1736
1737static void
1738exec_tex(struct tgsi_exec_machine *mach,
1739         const struct tgsi_full_instruction *inst,
1740         uint modifier)
1741{
1742   const uint unit = inst->Src[1].Register.Index;
1743   union tgsi_exec_channel r[4];
1744   const union tgsi_exec_channel *lod = &ZeroVec;
1745   enum tgsi_sampler_control control;
1746   uint chan;
1747
1748   if (modifier != TEX_MODIFIER_NONE) {
1749      FETCH(&r[3], 0, CHAN_W);
1750      if (modifier != TEX_MODIFIER_PROJECTED) {
1751         lod = &r[3];
1752      }
1753   }
1754
1755   if (modifier == TEX_MODIFIER_EXPLICIT_LOD) {
1756      control = tgsi_sampler_lod_explicit;
1757   } else {
1758      control = tgsi_sampler_lod_bias;
1759   }
1760
1761   switch (inst->Texture.Texture) {
1762   case TGSI_TEXTURE_1D:
1763      FETCH(&r[0], 0, CHAN_X);
1764
1765      if (modifier == TEX_MODIFIER_PROJECTED) {
1766         micro_div(&r[0], &r[0], &r[3]);
1767      }
1768
1769      fetch_texel(mach->Samplers[unit],
1770                  &r[0], &ZeroVec, &ZeroVec, lod,  /* S, T, P, LOD */
1771                  control,
1772                  &r[0], &r[1], &r[2], &r[3]);     /* R, G, B, A */
1773      break;
1774   case TGSI_TEXTURE_SHADOW1D:
1775      FETCH(&r[0], 0, CHAN_X);
1776      FETCH(&r[2], 0, CHAN_Z);
1777
1778      if (modifier == TEX_MODIFIER_PROJECTED) {
1779         micro_div(&r[0], &r[0], &r[3]);
1780      }
1781
1782      fetch_texel(mach->Samplers[unit],
1783                  &r[0], &ZeroVec, &r[2], lod,  /* S, T, P, LOD */
1784                  control,
1785                  &r[0], &r[1], &r[2], &r[3]);     /* R, G, B, A */
1786      break;
1787
1788   case TGSI_TEXTURE_2D:
1789   case TGSI_TEXTURE_RECT:
1790   case TGSI_TEXTURE_SHADOW2D:
1791   case TGSI_TEXTURE_SHADOWRECT:
1792      FETCH(&r[0], 0, CHAN_X);
1793      FETCH(&r[1], 0, CHAN_Y);
1794      FETCH(&r[2], 0, CHAN_Z);
1795
1796      if (modifier == TEX_MODIFIER_PROJECTED) {
1797         micro_div(&r[0], &r[0], &r[3]);
1798         micro_div(&r[1], &r[1], &r[3]);
1799         micro_div(&r[2], &r[2], &r[3]);
1800      }
1801
1802      fetch_texel(mach->Samplers[unit],
1803                  &r[0], &r[1], &r[2], lod,     /* S, T, P, LOD */
1804                  control,
1805                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
1806      break;
1807
1808   case TGSI_TEXTURE_1D_ARRAY:
1809      FETCH(&r[0], 0, CHAN_X);
1810      FETCH(&r[1], 0, CHAN_Y);
1811
1812      if (modifier == TEX_MODIFIER_PROJECTED) {
1813         micro_div(&r[0], &r[0], &r[3]);
1814      }
1815
1816      fetch_texel(mach->Samplers[unit],
1817                  &r[0], &r[1], &ZeroVec, lod,     /* S, T, P, LOD */
1818                  control,
1819                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
1820      break;
1821   case TGSI_TEXTURE_SHADOW1D_ARRAY:
1822      FETCH(&r[0], 0, CHAN_X);
1823      FETCH(&r[1], 0, CHAN_Y);
1824      FETCH(&r[2], 0, CHAN_Z);
1825
1826      if (modifier == TEX_MODIFIER_PROJECTED) {
1827         micro_div(&r[0], &r[0], &r[3]);
1828      }
1829
1830      fetch_texel(mach->Samplers[unit],
1831                  &r[0], &r[1], &r[2], lod,     /* S, T, P, LOD */
1832                  control,
1833                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
1834      break;
1835
1836   case TGSI_TEXTURE_2D_ARRAY:
1837      FETCH(&r[0], 0, CHAN_X);
1838      FETCH(&r[1], 0, CHAN_Y);
1839      FETCH(&r[2], 0, CHAN_Z);
1840
1841      if (modifier == TEX_MODIFIER_PROJECTED) {
1842         micro_div(&r[0], &r[0], &r[3]);
1843         micro_div(&r[1], &r[1], &r[3]);
1844      }
1845
1846      fetch_texel(mach->Samplers[unit],
1847                  &r[0], &r[1], &r[2], lod,     /* S, T, P, LOD */
1848                  control,
1849                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
1850      break;
1851   case TGSI_TEXTURE_SHADOW2D_ARRAY:
1852      FETCH(&r[0], 0, CHAN_X);
1853      FETCH(&r[1], 0, CHAN_Y);
1854      FETCH(&r[2], 0, CHAN_Z);
1855      FETCH(&r[3], 0, CHAN_W);
1856
1857      fetch_texel(mach->Samplers[unit],
1858                  &r[0], &r[1], &r[2], &r[3],     /* S, T, P, LOD */
1859                  control,
1860                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
1861      break;
1862   case TGSI_TEXTURE_3D:
1863   case TGSI_TEXTURE_CUBE:
1864      FETCH(&r[0], 0, CHAN_X);
1865      FETCH(&r[1], 0, CHAN_Y);
1866      FETCH(&r[2], 0, CHAN_Z);
1867
1868      if (modifier == TEX_MODIFIER_PROJECTED) {
1869         micro_div(&r[0], &r[0], &r[3]);
1870         micro_div(&r[1], &r[1], &r[3]);
1871         micro_div(&r[2], &r[2], &r[3]);
1872      }
1873
1874      fetch_texel(mach->Samplers[unit],
1875                  &r[0], &r[1], &r[2], lod,
1876                  control,
1877                  &r[0], &r[1], &r[2], &r[3]);
1878      break;
1879
1880   default:
1881      assert(0);
1882   }
1883
1884#if 0
1885   debug_printf("fetch r: %g %g %g %g\n",
1886         r[0].f[0], r[0].f[1], r[0].f[2], r[0].f[3]);
1887   debug_printf("fetch g: %g %g %g %g\n",
1888         r[1].f[0], r[1].f[1], r[1].f[2], r[1].f[3]);
1889   debug_printf("fetch b: %g %g %g %g\n",
1890         r[2].f[0], r[2].f[1], r[2].f[2], r[2].f[3]);
1891   debug_printf("fetch a: %g %g %g %g\n",
1892         r[3].f[0], r[3].f[1], r[3].f[2], r[3].f[3]);
1893#endif
1894
1895   for (chan = 0; chan < NUM_CHANNELS; chan++) {
1896      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1897         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
1898      }
1899   }
1900}
1901
1902static void
1903exec_txd(struct tgsi_exec_machine *mach,
1904         const struct tgsi_full_instruction *inst)
1905{
1906   const uint unit = inst->Src[3].Register.Index;
1907   union tgsi_exec_channel r[4];
1908   uint chan;
1909
1910   /*
1911    * XXX: This is fake TXD -- the derivatives are not taken into account, yet.
1912    */
1913
1914   switch (inst->Texture.Texture) {
1915   case TGSI_TEXTURE_1D:
1916   case TGSI_TEXTURE_SHADOW1D:
1917
1918      FETCH(&r[0], 0, CHAN_X);
1919
1920      fetch_texel(mach->Samplers[unit],
1921                  &r[0], &ZeroVec, &ZeroVec, &ZeroVec,   /* S, T, P, BIAS */
1922                  tgsi_sampler_lod_bias,
1923                  &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
1924      break;
1925
1926   case TGSI_TEXTURE_1D_ARRAY:
1927   case TGSI_TEXTURE_2D:
1928   case TGSI_TEXTURE_RECT:
1929   case TGSI_TEXTURE_SHADOW1D_ARRAY:
1930   case TGSI_TEXTURE_SHADOW2D:
1931   case TGSI_TEXTURE_SHADOWRECT:
1932
1933      FETCH(&r[0], 0, CHAN_X);
1934      FETCH(&r[1], 0, CHAN_Y);
1935      FETCH(&r[2], 0, CHAN_Z);
1936
1937      fetch_texel(mach->Samplers[unit],
1938                  &r[0], &r[1], &r[2], &ZeroVec,   /* inputs */
1939                  tgsi_sampler_lod_bias,
1940                  &r[0], &r[1], &r[2], &r[3]);     /* outputs */
1941      break;
1942
1943   case TGSI_TEXTURE_2D_ARRAY:
1944   case TGSI_TEXTURE_3D:
1945   case TGSI_TEXTURE_CUBE:
1946
1947      FETCH(&r[0], 0, CHAN_X);
1948      FETCH(&r[1], 0, CHAN_Y);
1949      FETCH(&r[2], 0, CHAN_Z);
1950
1951      fetch_texel(mach->Samplers[unit],
1952                  &r[0], &r[1], &r[2], &ZeroVec,
1953                  tgsi_sampler_lod_bias,
1954                  &r[0], &r[1], &r[2], &r[3]);
1955      break;
1956
1957   case TGSI_TEXTURE_SHADOW2D_ARRAY:
1958
1959      FETCH(&r[0], 0, CHAN_X);
1960      FETCH(&r[1], 0, CHAN_Y);
1961      FETCH(&r[2], 0, CHAN_Z);
1962      FETCH(&r[3], 0, CHAN_W);
1963
1964      fetch_texel(mach->Samplers[unit],
1965                  &r[0], &r[1], &r[2], &r[3],
1966                  tgsi_sampler_lod_bias,
1967                  &r[0], &r[1], &r[2], &r[3]);
1968      break;
1969
1970   default:
1971      assert(0);
1972   }
1973
1974   for (chan = 0; chan < NUM_CHANNELS; chan++) {
1975      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1976         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
1977      }
1978   }
1979}
1980
1981
1982static void
1983exec_txf(struct tgsi_exec_machine *mach,
1984	 const struct tgsi_full_instruction *inst)
1985{
1986   struct tgsi_sampler *sampler;
1987   const uint unit = inst->Src[2].Register.Index;
1988   union tgsi_exec_channel r[4];
1989   union tgsi_exec_channel offset[3];
1990   uint chan;
1991   float rgba[NUM_CHANNELS][QUAD_SIZE];
1992   int j;
1993   int8_t offsets[3];
1994
1995   if (inst->Texture.NumOffsets == 1) {
1996      union tgsi_exec_channel index;
1997      index.i[0] = index.i[1] = index.i[2] = index.i[3] = inst->TexOffsets[0].Index;
1998      fetch_src_file_channel(mach, inst->TexOffsets[0].File,
1999                             inst->TexOffsets[0].SwizzleX, &index, &ZeroVec, &offset[0]);
2000      fetch_src_file_channel(mach, inst->TexOffsets[0].File,
2001                             inst->TexOffsets[0].SwizzleY, &index, &ZeroVec, &offset[1]);
2002      fetch_src_file_channel(mach, inst->TexOffsets[0].File,
2003                             inst->TexOffsets[0].SwizzleZ, &index, &ZeroVec, &offset[2]);
2004     offsets[0] = offset[0].i[0];
2005     offsets[1] = offset[1].i[0];
2006     offsets[2] = offset[2].i[0];
2007   } else
2008     offsets[0] = offsets[1] = offsets[2] = 0;
2009
2010   IFETCH(&r[3], 0, CHAN_W);
2011
2012   switch(inst->Texture.Texture) {
2013   case TGSI_TEXTURE_3D:
2014   case TGSI_TEXTURE_2D_ARRAY:
2015   case TGSI_TEXTURE_SHADOW2D_ARRAY:
2016      IFETCH(&r[2], 0, CHAN_Z);
2017      /* fallthrough */
2018   case TGSI_TEXTURE_2D:
2019   case TGSI_TEXTURE_RECT:
2020   case TGSI_TEXTURE_SHADOW1D_ARRAY:
2021   case TGSI_TEXTURE_SHADOW2D:
2022   case TGSI_TEXTURE_SHADOWRECT:
2023   case TGSI_TEXTURE_1D_ARRAY:
2024      IFETCH(&r[1], 0, CHAN_Y);
2025      /* fallthrough */
2026   case TGSI_TEXTURE_1D:
2027   case TGSI_TEXTURE_SHADOW1D:
2028      IFETCH(&r[0], 0, CHAN_X);
2029      break;
2030   default:
2031      assert(0);
2032      break;
2033   }
2034
2035   sampler = mach->Samplers[unit];
2036   sampler->get_texel(sampler, r[0].i, r[1].i, r[2].i, r[3].i,
2037		      offsets, rgba);
2038
2039   for (j = 0; j < QUAD_SIZE; j++) {
2040      r[0].f[j] = rgba[0][j];
2041      r[1].f[j] = rgba[1][j];
2042      r[2].f[j] = rgba[2][j];
2043      r[3].f[j] = rgba[3][j];
2044   }
2045
2046   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2047      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2048         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2049      }
2050   }
2051}
2052
2053static void
2054exec_txq(struct tgsi_exec_machine *mach,
2055         const struct tgsi_full_instruction *inst)
2056{
2057   struct tgsi_sampler *sampler;
2058   const uint unit = inst->Src[1].Register.Index;
2059   int result[4];
2060   union tgsi_exec_channel r[4], src;
2061   uint chan;
2062   int i,j;
2063
2064   fetch_source(mach, &src, &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_INT);
2065   sampler = mach->Samplers[unit];
2066
2067   sampler->get_dims(sampler, src.i[0], result);
2068
2069   for (i = 0; i < QUAD_SIZE; i++) {
2070      for (j = 0; j < 4; j++) {
2071	 r[j].i[i] = result[j];
2072      }
2073   }
2074
2075   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2076      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2077	 store_dest(mach, &r[chan], &inst->Dst[0], inst, chan,
2078		    TGSI_EXEC_DATA_INT);
2079      }
2080   }
2081}
2082
2083static void
2084exec_sample(struct tgsi_exec_machine *mach,
2085            const struct tgsi_full_instruction *inst,
2086            uint modifier)
2087{
2088   const uint resource_unit = inst->Src[1].Register.Index;
2089   const uint sampler_unit = inst->Src[2].Register.Index;
2090   union tgsi_exec_channel r[4];
2091   const union tgsi_exec_channel *lod = &ZeroVec;
2092   enum tgsi_sampler_control control;
2093   uint chan;
2094
2095   if (modifier != TEX_MODIFIER_NONE) {
2096      if (modifier == TEX_MODIFIER_LOD_BIAS)
2097         FETCH(&r[3], 3, CHAN_X);
2098      else /*TEX_MODIFIER_LOD*/
2099         FETCH(&r[3], 0, CHAN_W);
2100
2101      if (modifier != TEX_MODIFIER_PROJECTED) {
2102         lod = &r[3];
2103      }
2104   }
2105
2106   if (modifier == TEX_MODIFIER_EXPLICIT_LOD) {
2107      control = tgsi_sampler_lod_explicit;
2108   } else {
2109      control = tgsi_sampler_lod_bias;
2110   }
2111
2112   switch (mach->Resources[resource_unit].Resource) {
2113   case TGSI_TEXTURE_1D:
2114   case TGSI_TEXTURE_SHADOW1D:
2115      FETCH(&r[0], 0, CHAN_X);
2116
2117      if (modifier == TEX_MODIFIER_PROJECTED) {
2118         micro_div(&r[0], &r[0], &r[3]);
2119      }
2120
2121      fetch_texel(mach->Samplers[sampler_unit],
2122                  &r[0], &ZeroVec, &ZeroVec, lod,  /* S, T, P, LOD */
2123                  control,
2124                  &r[0], &r[1], &r[2], &r[3]);     /* R, G, B, A */
2125      break;
2126
2127   case TGSI_TEXTURE_1D_ARRAY:
2128   case TGSI_TEXTURE_2D:
2129   case TGSI_TEXTURE_RECT:
2130   case TGSI_TEXTURE_SHADOW1D_ARRAY:
2131   case TGSI_TEXTURE_SHADOW2D:
2132   case TGSI_TEXTURE_SHADOWRECT:
2133      FETCH(&r[0], 0, CHAN_X);
2134      FETCH(&r[1], 0, CHAN_Y);
2135      FETCH(&r[2], 0, CHAN_Z);
2136
2137      if (modifier == TEX_MODIFIER_PROJECTED) {
2138         micro_div(&r[0], &r[0], &r[3]);
2139         micro_div(&r[1], &r[1], &r[3]);
2140         micro_div(&r[2], &r[2], &r[3]);
2141      }
2142
2143      fetch_texel(mach->Samplers[sampler_unit],
2144                  &r[0], &r[1], &r[2], lod,     /* S, T, P, LOD */
2145                  control,
2146                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
2147      break;
2148
2149   case TGSI_TEXTURE_2D_ARRAY:
2150   case TGSI_TEXTURE_3D:
2151   case TGSI_TEXTURE_CUBE:
2152      FETCH(&r[0], 0, CHAN_X);
2153      FETCH(&r[1], 0, CHAN_Y);
2154      FETCH(&r[2], 0, CHAN_Z);
2155
2156      if (modifier == TEX_MODIFIER_PROJECTED) {
2157         micro_div(&r[0], &r[0], &r[3]);
2158         micro_div(&r[1], &r[1], &r[3]);
2159         micro_div(&r[2], &r[2], &r[3]);
2160      }
2161
2162      fetch_texel(mach->Samplers[sampler_unit],
2163                  &r[0], &r[1], &r[2], lod,
2164                  control,
2165                  &r[0], &r[1], &r[2], &r[3]);
2166      break;
2167
2168   case TGSI_TEXTURE_SHADOW2D_ARRAY:
2169      FETCH(&r[0], 0, CHAN_X);
2170      FETCH(&r[1], 0, CHAN_Y);
2171      FETCH(&r[2], 0, CHAN_Z);
2172      FETCH(&r[3], 0, CHAN_W);
2173
2174      assert(modifier != TEX_MODIFIER_PROJECTED);
2175
2176      fetch_texel(mach->Samplers[sampler_unit],
2177                  &r[0], &r[1], &r[2], &r[3],
2178                  control,
2179                  &r[0], &r[1], &r[2], &r[3]);
2180      break;
2181
2182   default:
2183      assert(0);
2184   }
2185
2186   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2187      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2188         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2189      }
2190   }
2191}
2192
2193static void
2194exec_sample_d(struct tgsi_exec_machine *mach,
2195              const struct tgsi_full_instruction *inst)
2196{
2197   const uint resource_unit = inst->Src[1].Register.Index;
2198   const uint sampler_unit = inst->Src[2].Register.Index;
2199   union tgsi_exec_channel r[4];
2200   uint chan;
2201   /*
2202    * XXX: This is fake SAMPLE_D -- the derivatives are not taken into account, yet.
2203    */
2204
2205   switch (mach->Resources[resource_unit].Resource) {
2206   case TGSI_TEXTURE_1D:
2207   case TGSI_TEXTURE_SHADOW1D:
2208
2209      FETCH(&r[0], 0, CHAN_X);
2210
2211      fetch_texel(mach->Samplers[sampler_unit],
2212                  &r[0], &ZeroVec, &ZeroVec, &ZeroVec,   /* S, T, P, BIAS */
2213                  tgsi_sampler_lod_bias,
2214                  &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
2215      break;
2216
2217   case TGSI_TEXTURE_2D:
2218   case TGSI_TEXTURE_RECT:
2219   case TGSI_TEXTURE_SHADOW2D:
2220   case TGSI_TEXTURE_SHADOWRECT:
2221
2222      FETCH(&r[0], 0, CHAN_X);
2223      FETCH(&r[1], 0, CHAN_Y);
2224      FETCH(&r[2], 0, CHAN_Z);
2225
2226      fetch_texel(mach->Samplers[sampler_unit],
2227                  &r[0], &r[1], &r[2], &ZeroVec,   /* inputs */
2228                  tgsi_sampler_lod_bias,
2229                  &r[0], &r[1], &r[2], &r[3]);     /* outputs */
2230      break;
2231
2232   case TGSI_TEXTURE_3D:
2233   case TGSI_TEXTURE_CUBE:
2234
2235      FETCH(&r[0], 0, CHAN_X);
2236      FETCH(&r[1], 0, CHAN_Y);
2237      FETCH(&r[2], 0, CHAN_Z);
2238
2239      fetch_texel(mach->Samplers[sampler_unit],
2240                  &r[0], &r[1], &r[2], &ZeroVec,
2241                  tgsi_sampler_lod_bias,
2242                  &r[0], &r[1], &r[2], &r[3]);
2243      break;
2244
2245   default:
2246      assert(0);
2247   }
2248
2249   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2250      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2251         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2252      }
2253   }
2254}
2255
2256
2257/**
2258 * Evaluate a constant-valued coefficient at the position of the
2259 * current quad.
2260 */
2261static void
2262eval_constant_coef(
2263   struct tgsi_exec_machine *mach,
2264   unsigned attrib,
2265   unsigned chan )
2266{
2267   unsigned i;
2268
2269   for( i = 0; i < QUAD_SIZE; i++ ) {
2270      mach->Inputs[attrib].xyzw[chan].f[i] = mach->InterpCoefs[attrib].a0[chan];
2271   }
2272}
2273
2274/**
2275 * Evaluate a linear-valued coefficient at the position of the
2276 * current quad.
2277 */
2278static void
2279eval_linear_coef(
2280   struct tgsi_exec_machine *mach,
2281   unsigned attrib,
2282   unsigned chan )
2283{
2284   const float x = mach->QuadPos.xyzw[0].f[0];
2285   const float y = mach->QuadPos.xyzw[1].f[0];
2286   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
2287   const float dady = mach->InterpCoefs[attrib].dady[chan];
2288   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
2289   mach->Inputs[attrib].xyzw[chan].f[0] = a0;
2290   mach->Inputs[attrib].xyzw[chan].f[1] = a0 + dadx;
2291   mach->Inputs[attrib].xyzw[chan].f[2] = a0 + dady;
2292   mach->Inputs[attrib].xyzw[chan].f[3] = a0 + dadx + dady;
2293}
2294
2295/**
2296 * Evaluate a perspective-valued coefficient at the position of the
2297 * current quad.
2298 */
2299static void
2300eval_perspective_coef(
2301   struct tgsi_exec_machine *mach,
2302   unsigned attrib,
2303   unsigned chan )
2304{
2305   const float x = mach->QuadPos.xyzw[0].f[0];
2306   const float y = mach->QuadPos.xyzw[1].f[0];
2307   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
2308   const float dady = mach->InterpCoefs[attrib].dady[chan];
2309   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
2310   const float *w = mach->QuadPos.xyzw[3].f;
2311   /* divide by W here */
2312   mach->Inputs[attrib].xyzw[chan].f[0] = a0 / w[0];
2313   mach->Inputs[attrib].xyzw[chan].f[1] = (a0 + dadx) / w[1];
2314   mach->Inputs[attrib].xyzw[chan].f[2] = (a0 + dady) / w[2];
2315   mach->Inputs[attrib].xyzw[chan].f[3] = (a0 + dadx + dady) / w[3];
2316}
2317
2318
/**
 * Signature of the eval_*_coef() interpolation routines, so that one of
 * them can be selected at run time and applied per (attribute, channel).
 */
typedef void (*eval_coef_func)(struct tgsi_exec_machine *mach,
                               unsigned attrib,
                               unsigned chan);
2323
2324static void
2325exec_declaration(struct tgsi_exec_machine *mach,
2326                 const struct tgsi_full_declaration *decl)
2327{
2328   if (decl->Declaration.File == TGSI_FILE_RESOURCE) {
2329      mach->Resources[decl->Range.First] = decl->Resource;
2330      return;
2331   }
2332
2333   if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) {
2334      if (decl->Declaration.File == TGSI_FILE_INPUT) {
2335         uint first, last, mask;
2336
2337         first = decl->Range.First;
2338         last = decl->Range.Last;
2339         mask = decl->Declaration.UsageMask;
2340
2341         /* XXX we could remove this special-case code since
2342          * mach->InterpCoefs[first].a0 should already have the
2343          * front/back-face value.  But we should first update the
2344          * ureg code to emit the right UsageMask value (WRITEMASK_X).
2345          * Then, we could remove the tgsi_exec_machine::Face field.
2346          */
2347         /* XXX make FACE a system value */
2348         if (decl->Semantic.Name == TGSI_SEMANTIC_FACE) {
2349            uint i;
2350
2351            assert(decl->Semantic.Index == 0);
2352            assert(first == last);
2353
2354            for (i = 0; i < QUAD_SIZE; i++) {
2355               mach->Inputs[first].xyzw[0].f[i] = mach->Face;
2356            }
2357         } else {
2358            eval_coef_func eval;
2359            uint i, j;
2360
2361            switch (decl->Declaration.Interpolate) {
2362            case TGSI_INTERPOLATE_CONSTANT:
2363               eval = eval_constant_coef;
2364               break;
2365
2366            case TGSI_INTERPOLATE_LINEAR:
2367               eval = eval_linear_coef;
2368               break;
2369
2370            case TGSI_INTERPOLATE_PERSPECTIVE:
2371               eval = eval_perspective_coef;
2372               break;
2373
2374            default:
2375               assert(0);
2376               return;
2377            }
2378
2379            for (j = 0; j < NUM_CHANNELS; j++) {
2380               if (mask & (1 << j)) {
2381                  for (i = first; i <= last; i++) {
2382                     eval(mach, i, j);
2383                  }
2384               }
2385            }
2386         }
2387      }
2388   }
2389
2390   if (decl->Declaration.File == TGSI_FILE_SYSTEM_VALUE) {
2391      mach->SysSemanticToIndex[decl->Declaration.Semantic] = decl->Range.First;
2392   }
2393}
2394
2395
/** Per-channel operation that takes no operand and only writes a result. */
typedef void (*micro_op)(union tgsi_exec_channel *result);
2397
2398static void
2399exec_vector(struct tgsi_exec_machine *mach,
2400            const struct tgsi_full_instruction *inst,
2401            micro_op op,
2402            enum tgsi_exec_datatype dst_datatype)
2403{
2404   unsigned int chan;
2405
2406   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2407      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2408         union tgsi_exec_channel dst;
2409
2410         op(&dst);
2411         store_dest(mach, &dst, &inst->Dst[0], inst, chan, dst_datatype);
2412      }
2413   }
2414}
2415
/** Per-channel operation with one operand. */
typedef void (*micro_unary_op)(union tgsi_exec_channel *result,
                               const union tgsi_exec_channel *operand);
2418
2419static void
2420exec_scalar_unary(struct tgsi_exec_machine *mach,
2421                  const struct tgsi_full_instruction *inst,
2422                  micro_unary_op op,
2423                  enum tgsi_exec_datatype dst_datatype,
2424                  enum tgsi_exec_datatype src_datatype)
2425{
2426   unsigned int chan;
2427   union tgsi_exec_channel src;
2428   union tgsi_exec_channel dst;
2429
2430   fetch_source(mach, &src, &inst->Src[0], CHAN_X, src_datatype);
2431   op(&dst, &src);
2432   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2433      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2434         store_dest(mach, &dst, &inst->Dst[0], inst, chan, dst_datatype);
2435      }
2436   }
2437}
2438
2439static void
2440exec_vector_unary(struct tgsi_exec_machine *mach,
2441                  const struct tgsi_full_instruction *inst,
2442                  micro_unary_op op,
2443                  enum tgsi_exec_datatype dst_datatype,
2444                  enum tgsi_exec_datatype src_datatype)
2445{
2446   unsigned int chan;
2447   struct tgsi_exec_vector dst;
2448
2449   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2450      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2451         union tgsi_exec_channel src;
2452
2453         fetch_source(mach, &src, &inst->Src[0], chan, src_datatype);
2454         op(&dst.xyzw[chan], &src);
2455      }
2456   }
2457   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2458      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2459         store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
2460      }
2461   }
2462}
2463
/** Per-channel operation with two operands. */
typedef void (*micro_binary_op)(union tgsi_exec_channel *result,
                                const union tgsi_exec_channel *lhs,
                                const union tgsi_exec_channel *rhs);
2467
2468static void
2469exec_scalar_binary(struct tgsi_exec_machine *mach,
2470                   const struct tgsi_full_instruction *inst,
2471                   micro_binary_op op,
2472                   enum tgsi_exec_datatype dst_datatype,
2473                   enum tgsi_exec_datatype src_datatype)
2474{
2475   unsigned int chan;
2476   union tgsi_exec_channel src[2];
2477   union tgsi_exec_channel dst;
2478
2479   fetch_source(mach, &src[0], &inst->Src[0], CHAN_X, src_datatype);
2480   fetch_source(mach, &src[1], &inst->Src[1], CHAN_Y, src_datatype);
2481   op(&dst, &src[0], &src[1]);
2482   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2483      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2484         store_dest(mach, &dst, &inst->Dst[0], inst, chan, dst_datatype);
2485      }
2486   }
2487}
2488
2489static void
2490exec_vector_binary(struct tgsi_exec_machine *mach,
2491                   const struct tgsi_full_instruction *inst,
2492                   micro_binary_op op,
2493                   enum tgsi_exec_datatype dst_datatype,
2494                   enum tgsi_exec_datatype src_datatype)
2495{
2496   unsigned int chan;
2497   struct tgsi_exec_vector dst;
2498
2499   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2500      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2501         union tgsi_exec_channel src[2];
2502
2503         fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
2504         fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
2505         op(&dst.xyzw[chan], &src[0], &src[1]);
2506      }
2507   }
2508   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2509      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2510         store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
2511      }
2512   }
2513}
2514
/** Per-channel operation with three operands. */
typedef void (*micro_trinary_op)(union tgsi_exec_channel *result,
                                 const union tgsi_exec_channel *a,
                                 const union tgsi_exec_channel *b,
                                 const union tgsi_exec_channel *c);
2519
2520static void
2521exec_vector_trinary(struct tgsi_exec_machine *mach,
2522                    const struct tgsi_full_instruction *inst,
2523                    micro_trinary_op op,
2524                    enum tgsi_exec_datatype dst_datatype,
2525                    enum tgsi_exec_datatype src_datatype)
2526{
2527   unsigned int chan;
2528   struct tgsi_exec_vector dst;
2529
2530   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2531      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2532         union tgsi_exec_channel src[3];
2533
2534         fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
2535         fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
2536         fetch_source(mach, &src[2], &inst->Src[2], chan, src_datatype);
2537         op(&dst.xyzw[chan], &src[0], &src[1], &src[2]);
2538      }
2539   }
2540   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2541      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2542         store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
2543      }
2544   }
2545}
2546
2547static void
2548exec_dp3(struct tgsi_exec_machine *mach,
2549         const struct tgsi_full_instruction *inst)
2550{
2551   unsigned int chan;
2552   union tgsi_exec_channel arg[3];
2553
2554   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2555   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2556   micro_mul(&arg[2], &arg[0], &arg[1]);
2557
2558   for (chan = CHAN_Y; chan <= CHAN_Z; chan++) {
2559      fetch_source(mach, &arg[0], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
2560      fetch_source(mach, &arg[1], &inst->Src[1], chan, TGSI_EXEC_DATA_FLOAT);
2561      micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
2562   }
2563
2564   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2565      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2566         store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2567      }
2568   }
2569}
2570
2571static void
2572exec_dp4(struct tgsi_exec_machine *mach,
2573         const struct tgsi_full_instruction *inst)
2574{
2575   unsigned int chan;
2576   union tgsi_exec_channel arg[3];
2577
2578   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2579   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2580   micro_mul(&arg[2], &arg[0], &arg[1]);
2581
2582   for (chan = CHAN_Y; chan <= CHAN_W; chan++) {
2583      fetch_source(mach, &arg[0], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
2584      fetch_source(mach, &arg[1], &inst->Src[1], chan, TGSI_EXEC_DATA_FLOAT);
2585      micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
2586   }
2587
2588   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2589      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2590         store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2591      }
2592   }
2593}
2594
2595static void
2596exec_dp2a(struct tgsi_exec_machine *mach,
2597          const struct tgsi_full_instruction *inst)
2598{
2599   unsigned int chan;
2600   union tgsi_exec_channel arg[3];
2601
2602   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2603   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2604   micro_mul(&arg[2], &arg[0], &arg[1]);
2605
2606   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2607   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2608   micro_mad(&arg[0], &arg[0], &arg[1], &arg[2]);
2609
2610   fetch_source(mach, &arg[1], &inst->Src[2], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2611   micro_add(&arg[0], &arg[0], &arg[1]);
2612
2613   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2614      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2615         store_dest(mach, &arg[0], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2616      }
2617   }
2618}
2619
2620static void
2621exec_dph(struct tgsi_exec_machine *mach,
2622         const struct tgsi_full_instruction *inst)
2623{
2624   unsigned int chan;
2625   union tgsi_exec_channel arg[3];
2626
2627   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2628   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2629   micro_mul(&arg[2], &arg[0], &arg[1]);
2630
2631   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2632   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2633   micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
2634
2635   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2636   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2637   micro_mad(&arg[0], &arg[0], &arg[1], &arg[2]);
2638
2639   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_W, TGSI_EXEC_DATA_FLOAT);
2640   micro_add(&arg[0], &arg[0], &arg[1]);
2641
2642   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2643      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2644         store_dest(mach, &arg[0], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2645      }
2646   }
2647}
2648
2649static void
2650exec_dp2(struct tgsi_exec_machine *mach,
2651         const struct tgsi_full_instruction *inst)
2652{
2653   unsigned int chan;
2654   union tgsi_exec_channel arg[3];
2655
2656   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2657   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2658   micro_mul(&arg[2], &arg[0], &arg[1]);
2659
2660   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2661   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2662   micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
2663
2664   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2665      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2666         store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2667      }
2668   }
2669}
2670
2671static void
2672exec_nrm4(struct tgsi_exec_machine *mach,
2673          const struct tgsi_full_instruction *inst)
2674{
2675   unsigned int chan;
2676   union tgsi_exec_channel arg[4];
2677   union tgsi_exec_channel scale;
2678
2679   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2680   micro_mul(&scale, &arg[0], &arg[0]);
2681
2682   for (chan = CHAN_Y; chan <= CHAN_W; chan++) {
2683      union tgsi_exec_channel product;
2684
2685      fetch_source(mach, &arg[chan], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
2686      micro_mul(&product, &arg[chan], &arg[chan]);
2687      micro_add(&scale, &scale, &product);
2688   }
2689
2690   micro_rsq(&scale, &scale);
2691
2692   for (chan = CHAN_X; chan <= CHAN_W; chan++) {
2693      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2694         micro_mul(&arg[chan], &arg[chan], &scale);
2695         store_dest(mach, &arg[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2696      }
2697   }
2698}
2699
2700static void
2701exec_nrm3(struct tgsi_exec_machine *mach,
2702          const struct tgsi_full_instruction *inst)
2703{
2704   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XYZ) {
2705      unsigned int chan;
2706      union tgsi_exec_channel arg[3];
2707      union tgsi_exec_channel scale;
2708
2709      fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2710      micro_mul(&scale, &arg[0], &arg[0]);
2711
2712      for (chan = CHAN_Y; chan <= CHAN_Z; chan++) {
2713         union tgsi_exec_channel product;
2714
2715         fetch_source(mach, &arg[chan], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
2716         micro_mul(&product, &arg[chan], &arg[chan]);
2717         micro_add(&scale, &scale, &product);
2718      }
2719
2720      micro_rsq(&scale, &scale);
2721
2722      for (chan = CHAN_X; chan <= CHAN_Z; chan++) {
2723         if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2724            micro_mul(&arg[chan], &arg[chan], &scale);
2725            store_dest(mach, &arg[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2726         }
2727      }
2728   }
2729
2730   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2731      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2732   }
2733}
2734
2735static void
2736exec_scs(struct tgsi_exec_machine *mach,
2737         const struct tgsi_full_instruction *inst)
2738{
2739   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY) {
2740      union tgsi_exec_channel arg;
2741      union tgsi_exec_channel result;
2742
2743      fetch_source(mach, &arg, &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2744
2745      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2746         micro_cos(&result, &arg);
2747         store_dest(mach, &result, &inst->Dst[0], inst, CHAN_X, TGSI_EXEC_DATA_FLOAT);
2748      }
2749      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2750         micro_sin(&result, &arg);
2751         store_dest(mach, &result, &inst->Dst[0], inst, CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2752      }
2753   }
2754   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2755      store_dest(mach, &ZeroVec, &inst->Dst[0], inst, CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2756   }
2757   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2758      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2759   }
2760}
2761
2762static void
2763exec_x2d(struct tgsi_exec_machine *mach,
2764         const struct tgsi_full_instruction *inst)
2765{
2766   union tgsi_exec_channel r[4];
2767   union tgsi_exec_channel d[2];
2768
2769   fetch_source(mach, &r[0], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2770   fetch_source(mach, &r[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2771   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XZ) {
2772      fetch_source(mach, &r[2], &inst->Src[2], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2773      micro_mul(&r[2], &r[2], &r[0]);
2774      fetch_source(mach, &r[3], &inst->Src[2], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2775      micro_mul(&r[3], &r[3], &r[1]);
2776      micro_add(&r[2], &r[2], &r[3]);
2777      fetch_source(mach, &r[3], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2778      micro_add(&d[0], &r[2], &r[3]);
2779   }
2780   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_YW) {
2781      fetch_source(mach, &r[2], &inst->Src[2], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2782      micro_mul(&r[2], &r[2], &r[0]);
2783      fetch_source(mach, &r[3], &inst->Src[2], CHAN_W, TGSI_EXEC_DATA_FLOAT);
2784      micro_mul(&r[3], &r[3], &r[1]);
2785      micro_add(&r[2], &r[2], &r[3]);
2786      fetch_source(mach, &r[3], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2787      micro_add(&d[1], &r[2], &r[3]);
2788   }
2789   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2790      store_dest(mach, &d[0], &inst->Dst[0], inst, CHAN_X, TGSI_EXEC_DATA_FLOAT);
2791   }
2792   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2793      store_dest(mach, &d[1], &inst->Dst[0], inst, CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2794   }
2795   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2796      store_dest(mach, &d[0], &inst->Dst[0], inst, CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2797   }
2798   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2799      store_dest(mach, &d[1], &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2800   }
2801}
2802
2803static void
2804exec_rfl(struct tgsi_exec_machine *mach,
2805         const struct tgsi_full_instruction *inst)
2806{
2807   union tgsi_exec_channel r[9];
2808
2809   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XYZ) {
2810      /* r0 = dp3(src0, src0) */
2811      fetch_source(mach, &r[2], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2812      micro_mul(&r[0], &r[2], &r[2]);
2813      fetch_source(mach, &r[4], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2814      micro_mul(&r[8], &r[4], &r[4]);
2815      micro_add(&r[0], &r[0], &r[8]);
2816      fetch_source(mach, &r[6], &inst->Src[0], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2817      micro_mul(&r[8], &r[6], &r[6]);
2818      micro_add(&r[0], &r[0], &r[8]);
2819
2820      /* r1 = dp3(src0, src1) */
2821      fetch_source(mach, &r[3], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2822      micro_mul(&r[1], &r[2], &r[3]);
2823      fetch_source(mach, &r[5], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2824      micro_mul(&r[8], &r[4], &r[5]);
2825      micro_add(&r[1], &r[1], &r[8]);
2826      fetch_source(mach, &r[7], &inst->Src[1], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2827      micro_mul(&r[8], &r[6], &r[7]);
2828      micro_add(&r[1], &r[1], &r[8]);
2829
2830      /* r1 = 2 * r1 / r0 */
2831      micro_add(&r[1], &r[1], &r[1]);
2832      micro_div(&r[1], &r[1], &r[0]);
2833
2834      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2835         micro_mul(&r[2], &r[2], &r[1]);
2836         micro_sub(&r[2], &r[2], &r[3]);
2837         store_dest(mach, &r[2], &inst->Dst[0], inst, CHAN_X, TGSI_EXEC_DATA_FLOAT);
2838      }
2839      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2840         micro_mul(&r[4], &r[4], &r[1]);
2841         micro_sub(&r[4], &r[4], &r[5]);
2842         store_dest(mach, &r[4], &inst->Dst[0], inst, CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2843      }
2844      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2845         micro_mul(&r[6], &r[6], &r[1]);
2846         micro_sub(&r[6], &r[6], &r[7]);
2847         store_dest(mach, &r[6], &inst->Dst[0], inst, CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2848      }
2849   }
2850   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2851      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2852   }
2853}
2854
2855static void
2856exec_xpd(struct tgsi_exec_machine *mach,
2857         const struct tgsi_full_instruction *inst)
2858{
2859   union tgsi_exec_channel r[6];
2860   union tgsi_exec_channel d[3];
2861
2862   fetch_source(mach, &r[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2863   fetch_source(mach, &r[1], &inst->Src[1], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2864
2865   micro_mul(&r[2], &r[0], &r[1]);
2866
2867   fetch_source(mach, &r[3], &inst->Src[0], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2868   fetch_source(mach, &r[4], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2869
2870   micro_mul(&r[5], &r[3], &r[4] );
2871   micro_sub(&d[CHAN_X], &r[2], &r[5]);
2872
2873   fetch_source(mach, &r[2], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2874
2875   micro_mul(&r[3], &r[3], &r[2]);
2876
2877   fetch_source(mach, &r[5], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2878
2879   micro_mul(&r[1], &r[1], &r[5]);
2880   micro_sub(&d[CHAN_Y], &r[3], &r[1]);
2881
2882   micro_mul(&r[5], &r[5], &r[4]);
2883   micro_mul(&r[0], &r[0], &r[2]);
2884   micro_sub(&d[CHAN_Z], &r[5], &r[0]);
2885
2886   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2887      store_dest(mach, &d[CHAN_X], &inst->Dst[0], inst, CHAN_X, TGSI_EXEC_DATA_FLOAT);
2888   }
2889   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2890      store_dest(mach, &d[CHAN_Y], &inst->Dst[0], inst, CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2891   }
2892   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2893      store_dest(mach, &d[CHAN_Z], &inst->Dst[0], inst, CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2894   }
2895   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2896      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2897   }
2898}
2899
2900static void
2901exec_dst(struct tgsi_exec_machine *mach,
2902         const struct tgsi_full_instruction *inst)
2903{
2904   union tgsi_exec_channel r[2];
2905   union tgsi_exec_channel d[4];
2906
2907   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2908      fetch_source(mach, &r[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2909      fetch_source(mach, &r[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2910      micro_mul(&d[CHAN_Y], &r[0], &r[1]);
2911   }
2912   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2913      fetch_source(mach, &d[CHAN_Z], &inst->Src[0], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2914   }
2915   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2916      fetch_source(mach, &d[CHAN_W], &inst->Src[1], CHAN_W, TGSI_EXEC_DATA_FLOAT);
2917   }
2918
2919   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2920      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_X, TGSI_EXEC_DATA_FLOAT);
2921   }
2922   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2923      store_dest(mach, &d[CHAN_Y], &inst->Dst[0], inst, CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2924   }
2925   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2926      store_dest(mach, &d[CHAN_Z], &inst->Dst[0], inst, CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2927   }
2928   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2929      store_dest(mach, &d[CHAN_W], &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2930   }
2931}
2932
2933static void
2934exec_log(struct tgsi_exec_machine *mach,
2935         const struct tgsi_full_instruction *inst)
2936{
2937   union tgsi_exec_channel r[3];
2938
2939   fetch_source(mach, &r[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2940   micro_abs(&r[2], &r[0]);  /* r2 = abs(r0) */
2941   micro_lg2(&r[1], &r[2]);  /* r1 = lg2(r2) */
2942   micro_flr(&r[0], &r[1]);  /* r0 = floor(r1) */
2943   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2944      store_dest(mach, &r[0], &inst->Dst[0], inst, CHAN_X, TGSI_EXEC_DATA_FLOAT);
2945   }
2946   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2947      micro_exp2(&r[0], &r[0]);       /* r0 = 2 ^ r0 */
2948      micro_div(&r[0], &r[2], &r[0]); /* r0 = r2 / r0 */
2949      store_dest(mach, &r[0], &inst->Dst[0], inst, CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2950   }
2951   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2952      store_dest(mach, &r[1], &inst->Dst[0], inst, CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2953   }
2954   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2955      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2956   }
2957}
2958
2959static void
2960exec_exp(struct tgsi_exec_machine *mach,
2961         const struct tgsi_full_instruction *inst)
2962{
2963   union tgsi_exec_channel r[3];
2964
2965   fetch_source(mach, &r[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2966   micro_flr(&r[1], &r[0]);  /* r1 = floor(r0) */
2967   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2968      micro_exp2(&r[2], &r[1]);       /* r2 = 2 ^ r1 */
2969      store_dest(mach, &r[2], &inst->Dst[0], inst, CHAN_X, TGSI_EXEC_DATA_FLOAT);
2970   }
2971   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2972      micro_sub(&r[2], &r[0], &r[1]); /* r2 = r0 - r1 */
2973      store_dest(mach, &r[2], &inst->Dst[0], inst, CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2974   }
2975   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2976      micro_exp2(&r[2], &r[0]);       /* r2 = 2 ^ r0 */
2977      store_dest(mach, &r[2], &inst->Dst[0], inst, CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2978   }
2979   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2980      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2981   }
2982}
2983
2984static void
2985exec_lit(struct tgsi_exec_machine *mach,
2986         const struct tgsi_full_instruction *inst)
2987{
2988   union tgsi_exec_channel r[3];
2989   union tgsi_exec_channel d[3];
2990
2991   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_YZ) {
2992      fetch_source(mach, &r[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2993      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2994         fetch_source(mach, &r[1], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2995         micro_max(&r[1], &r[1], &ZeroVec);
2996
2997         fetch_source(mach, &r[2], &inst->Src[0], CHAN_W, TGSI_EXEC_DATA_FLOAT);
2998         micro_min(&r[2], &r[2], &P128Vec);
2999         micro_max(&r[2], &r[2], &M128Vec);
3000         micro_pow(&r[1], &r[1], &r[2]);
3001         micro_lt(&d[CHAN_Z], &ZeroVec, &r[0], &r[1], &ZeroVec);
3002         store_dest(mach, &d[CHAN_Z], &inst->Dst[0], inst, CHAN_Z, TGSI_EXEC_DATA_FLOAT);
3003      }
3004      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
3005         micro_max(&d[CHAN_Y], &r[0], &ZeroVec);
3006         store_dest(mach, &d[CHAN_Y], &inst->Dst[0], inst, CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3007      }
3008   }
3009   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
3010      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_X, TGSI_EXEC_DATA_FLOAT);
3011   }
3012
3013   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
3014      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
3015   }
3016}
3017
3018static void
3019exec_break(struct tgsi_exec_machine *mach)
3020{
3021   if (mach->BreakType == TGSI_EXEC_BREAK_INSIDE_LOOP) {
3022      /* turn off loop channels for each enabled exec channel */
3023      mach->LoopMask &= ~mach->ExecMask;
3024      /* Todo: if mach->LoopMask == 0, jump to end of loop */
3025      UPDATE_EXEC_MASK(mach);
3026   } else {
3027      assert(mach->BreakType == TGSI_EXEC_BREAK_INSIDE_SWITCH);
3028
3029      mach->Switch.mask = 0x0;
3030
3031      UPDATE_EXEC_MASK(mach);
3032   }
3033}
3034
3035static void
3036exec_switch(struct tgsi_exec_machine *mach,
3037            const struct tgsi_full_instruction *inst)
3038{
3039   assert(mach->SwitchStackTop < TGSI_EXEC_MAX_SWITCH_NESTING);
3040   assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
3041
3042   mach->SwitchStack[mach->SwitchStackTop++] = mach->Switch;
3043   fetch_source(mach, &mach->Switch.selector, &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_UINT);
3044   mach->Switch.mask = 0x0;
3045   mach->Switch.defaultMask = 0x0;
3046
3047   mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
3048   mach->BreakType = TGSI_EXEC_BREAK_INSIDE_SWITCH;
3049
3050   UPDATE_EXEC_MASK(mach);
3051}
3052
3053static void
3054exec_case(struct tgsi_exec_machine *mach,
3055          const struct tgsi_full_instruction *inst)
3056{
3057   uint prevMask = mach->SwitchStack[mach->SwitchStackTop - 1].mask;
3058   union tgsi_exec_channel src;
3059   uint mask = 0;
3060
3061   fetch_source(mach, &src, &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_UINT);
3062
3063   if (mach->Switch.selector.u[0] == src.u[0]) {
3064      mask |= 0x1;
3065   }
3066   if (mach->Switch.selector.u[1] == src.u[1]) {
3067      mask |= 0x2;
3068   }
3069   if (mach->Switch.selector.u[2] == src.u[2]) {
3070      mask |= 0x4;
3071   }
3072   if (mach->Switch.selector.u[3] == src.u[3]) {
3073      mask |= 0x8;
3074   }
3075
3076   mach->Switch.defaultMask |= mask;
3077
3078   mach->Switch.mask |= mask & prevMask;
3079
3080   UPDATE_EXEC_MASK(mach);
3081}
3082
3083static void
3084exec_default(struct tgsi_exec_machine *mach)
3085{
3086   uint prevMask = mach->SwitchStack[mach->SwitchStackTop - 1].mask;
3087
3088   mach->Switch.mask |= ~mach->Switch.defaultMask & prevMask;
3089
3090   UPDATE_EXEC_MASK(mach);
3091}
3092
3093static void
3094exec_endswitch(struct tgsi_exec_machine *mach)
3095{
3096   mach->Switch = mach->SwitchStack[--mach->SwitchStackTop];
3097   mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
3098
3099   UPDATE_EXEC_MASK(mach);
3100}
3101
3102static void
3103micro_i2f(union tgsi_exec_channel *dst,
3104          const union tgsi_exec_channel *src)
3105{
3106   dst->f[0] = (float)src->i[0];
3107   dst->f[1] = (float)src->i[1];
3108   dst->f[2] = (float)src->i[2];
3109   dst->f[3] = (float)src->i[3];
3110}
3111
3112static void
3113micro_not(union tgsi_exec_channel *dst,
3114          const union tgsi_exec_channel *src)
3115{
3116   dst->u[0] = ~src->u[0];
3117   dst->u[1] = ~src->u[1];
3118   dst->u[2] = ~src->u[2];
3119   dst->u[3] = ~src->u[3];
3120}
3121
3122static void
3123micro_shl(union tgsi_exec_channel *dst,
3124          const union tgsi_exec_channel *src0,
3125          const union tgsi_exec_channel *src1)
3126{
3127   dst->u[0] = src0->u[0] << src1->u[0];
3128   dst->u[1] = src0->u[1] << src1->u[1];
3129   dst->u[2] = src0->u[2] << src1->u[2];
3130   dst->u[3] = src0->u[3] << src1->u[3];
3131}
3132
3133static void
3134micro_and(union tgsi_exec_channel *dst,
3135          const union tgsi_exec_channel *src0,
3136          const union tgsi_exec_channel *src1)
3137{
3138   dst->u[0] = src0->u[0] & src1->u[0];
3139   dst->u[1] = src0->u[1] & src1->u[1];
3140   dst->u[2] = src0->u[2] & src1->u[2];
3141   dst->u[3] = src0->u[3] & src1->u[3];
3142}
3143
3144static void
3145micro_or(union tgsi_exec_channel *dst,
3146         const union tgsi_exec_channel *src0,
3147         const union tgsi_exec_channel *src1)
3148{
3149   dst->u[0] = src0->u[0] | src1->u[0];
3150   dst->u[1] = src0->u[1] | src1->u[1];
3151   dst->u[2] = src0->u[2] | src1->u[2];
3152   dst->u[3] = src0->u[3] | src1->u[3];
3153}
3154
3155static void
3156micro_xor(union tgsi_exec_channel *dst,
3157          const union tgsi_exec_channel *src0,
3158          const union tgsi_exec_channel *src1)
3159{
3160   dst->u[0] = src0->u[0] ^ src1->u[0];
3161   dst->u[1] = src0->u[1] ^ src1->u[1];
3162   dst->u[2] = src0->u[2] ^ src1->u[2];
3163   dst->u[3] = src0->u[3] ^ src1->u[3];
3164}
3165
3166static void
3167micro_mod(union tgsi_exec_channel *dst,
3168          const union tgsi_exec_channel *src0,
3169          const union tgsi_exec_channel *src1)
3170{
3171   dst->i[0] = src0->i[0] % src1->i[0];
3172   dst->i[1] = src0->i[1] % src1->i[1];
3173   dst->i[2] = src0->i[2] % src1->i[2];
3174   dst->i[3] = src0->i[3] % src1->i[3];
3175}
3176
3177static void
3178micro_f2i(union tgsi_exec_channel *dst,
3179          const union tgsi_exec_channel *src)
3180{
3181   dst->i[0] = (int)src->f[0];
3182   dst->i[1] = (int)src->f[1];
3183   dst->i[2] = (int)src->f[2];
3184   dst->i[3] = (int)src->f[3];
3185}
3186
3187static void
3188micro_idiv(union tgsi_exec_channel *dst,
3189           const union tgsi_exec_channel *src0,
3190           const union tgsi_exec_channel *src1)
3191{
3192   dst->i[0] = src0->i[0] / src1->i[0];
3193   dst->i[1] = src0->i[1] / src1->i[1];
3194   dst->i[2] = src0->i[2] / src1->i[2];
3195   dst->i[3] = src0->i[3] / src1->i[3];
3196}
3197
3198static void
3199micro_imax(union tgsi_exec_channel *dst,
3200           const union tgsi_exec_channel *src0,
3201           const union tgsi_exec_channel *src1)
3202{
3203   dst->i[0] = src0->i[0] > src1->i[0] ? src0->i[0] : src1->i[0];
3204   dst->i[1] = src0->i[1] > src1->i[1] ? src0->i[1] : src1->i[1];
3205   dst->i[2] = src0->i[2] > src1->i[2] ? src0->i[2] : src1->i[2];
3206   dst->i[3] = src0->i[3] > src1->i[3] ? src0->i[3] : src1->i[3];
3207}
3208
3209static void
3210micro_imin(union tgsi_exec_channel *dst,
3211           const union tgsi_exec_channel *src0,
3212           const union tgsi_exec_channel *src1)
3213{
3214   dst->i[0] = src0->i[0] < src1->i[0] ? src0->i[0] : src1->i[0];
3215   dst->i[1] = src0->i[1] < src1->i[1] ? src0->i[1] : src1->i[1];
3216   dst->i[2] = src0->i[2] < src1->i[2] ? src0->i[2] : src1->i[2];
3217   dst->i[3] = src0->i[3] < src1->i[3] ? src0->i[3] : src1->i[3];
3218}
3219
3220static void
3221micro_isge(union tgsi_exec_channel *dst,
3222           const union tgsi_exec_channel *src0,
3223           const union tgsi_exec_channel *src1)
3224{
3225   dst->i[0] = src0->i[0] >= src1->i[0] ? -1 : 0;
3226   dst->i[1] = src0->i[1] >= src1->i[1] ? -1 : 0;
3227   dst->i[2] = src0->i[2] >= src1->i[2] ? -1 : 0;
3228   dst->i[3] = src0->i[3] >= src1->i[3] ? -1 : 0;
3229}
3230
3231static void
3232micro_ishr(union tgsi_exec_channel *dst,
3233           const union tgsi_exec_channel *src0,
3234           const union tgsi_exec_channel *src1)
3235{
3236   dst->i[0] = src0->i[0] >> src1->i[0];
3237   dst->i[1] = src0->i[1] >> src1->i[1];
3238   dst->i[2] = src0->i[2] >> src1->i[2];
3239   dst->i[3] = src0->i[3] >> src1->i[3];
3240}
3241
3242static void
3243micro_islt(union tgsi_exec_channel *dst,
3244           const union tgsi_exec_channel *src0,
3245           const union tgsi_exec_channel *src1)
3246{
3247   dst->i[0] = src0->i[0] < src1->i[0] ? -1 : 0;
3248   dst->i[1] = src0->i[1] < src1->i[1] ? -1 : 0;
3249   dst->i[2] = src0->i[2] < src1->i[2] ? -1 : 0;
3250   dst->i[3] = src0->i[3] < src1->i[3] ? -1 : 0;
3251}
3252
3253static void
3254micro_f2u(union tgsi_exec_channel *dst,
3255          const union tgsi_exec_channel *src)
3256{
3257   dst->u[0] = (uint)src->f[0];
3258   dst->u[1] = (uint)src->f[1];
3259   dst->u[2] = (uint)src->f[2];
3260   dst->u[3] = (uint)src->f[3];
3261}
3262
3263static void
3264micro_u2f(union tgsi_exec_channel *dst,
3265          const union tgsi_exec_channel *src)
3266{
3267   dst->f[0] = (float)src->u[0];
3268   dst->f[1] = (float)src->u[1];
3269   dst->f[2] = (float)src->u[2];
3270   dst->f[3] = (float)src->u[3];
3271}
3272
3273static void
3274micro_uadd(union tgsi_exec_channel *dst,
3275           const union tgsi_exec_channel *src0,
3276           const union tgsi_exec_channel *src1)
3277{
3278   dst->u[0] = src0->u[0] + src1->u[0];
3279   dst->u[1] = src0->u[1] + src1->u[1];
3280   dst->u[2] = src0->u[2] + src1->u[2];
3281   dst->u[3] = src0->u[3] + src1->u[3];
3282}
3283
3284static void
3285micro_udiv(union tgsi_exec_channel *dst,
3286           const union tgsi_exec_channel *src0,
3287           const union tgsi_exec_channel *src1)
3288{
3289   dst->u[0] = src0->u[0] / src1->u[0];
3290   dst->u[1] = src0->u[1] / src1->u[1];
3291   dst->u[2] = src0->u[2] / src1->u[2];
3292   dst->u[3] = src0->u[3] / src1->u[3];
3293}
3294
3295static void
3296micro_umad(union tgsi_exec_channel *dst,
3297           const union tgsi_exec_channel *src0,
3298           const union tgsi_exec_channel *src1,
3299           const union tgsi_exec_channel *src2)
3300{
3301   dst->u[0] = src0->u[0] * src1->u[0] + src2->u[0];
3302   dst->u[1] = src0->u[1] * src1->u[1] + src2->u[1];
3303   dst->u[2] = src0->u[2] * src1->u[2] + src2->u[2];
3304   dst->u[3] = src0->u[3] * src1->u[3] + src2->u[3];
3305}
3306
3307static void
3308micro_umax(union tgsi_exec_channel *dst,
3309           const union tgsi_exec_channel *src0,
3310           const union tgsi_exec_channel *src1)
3311{
3312   dst->u[0] = src0->u[0] > src1->u[0] ? src0->u[0] : src1->u[0];
3313   dst->u[1] = src0->u[1] > src1->u[1] ? src0->u[1] : src1->u[1];
3314   dst->u[2] = src0->u[2] > src1->u[2] ? src0->u[2] : src1->u[2];
3315   dst->u[3] = src0->u[3] > src1->u[3] ? src0->u[3] : src1->u[3];
3316}
3317
3318static void
3319micro_umin(union tgsi_exec_channel *dst,
3320           const union tgsi_exec_channel *src0,
3321           const union tgsi_exec_channel *src1)
3322{
3323   dst->u[0] = src0->u[0] < src1->u[0] ? src0->u[0] : src1->u[0];
3324   dst->u[1] = src0->u[1] < src1->u[1] ? src0->u[1] : src1->u[1];
3325   dst->u[2] = src0->u[2] < src1->u[2] ? src0->u[2] : src1->u[2];
3326   dst->u[3] = src0->u[3] < src1->u[3] ? src0->u[3] : src1->u[3];
3327}
3328
3329static void
3330micro_umod(union tgsi_exec_channel *dst,
3331           const union tgsi_exec_channel *src0,
3332           const union tgsi_exec_channel *src1)
3333{
3334   dst->u[0] = src0->u[0] % src1->u[0];
3335   dst->u[1] = src0->u[1] % src1->u[1];
3336   dst->u[2] = src0->u[2] % src1->u[2];
3337   dst->u[3] = src0->u[3] % src1->u[3];
3338}
3339
3340static void
3341micro_umul(union tgsi_exec_channel *dst,
3342           const union tgsi_exec_channel *src0,
3343           const union tgsi_exec_channel *src1)
3344{
3345   dst->u[0] = src0->u[0] * src1->u[0];
3346   dst->u[1] = src0->u[1] * src1->u[1];
3347   dst->u[2] = src0->u[2] * src1->u[2];
3348   dst->u[3] = src0->u[3] * src1->u[3];
3349}
3350
3351static void
3352micro_useq(union tgsi_exec_channel *dst,
3353           const union tgsi_exec_channel *src0,
3354           const union tgsi_exec_channel *src1)
3355{
3356   dst->u[0] = src0->u[0] == src1->u[0] ? ~0 : 0;
3357   dst->u[1] = src0->u[1] == src1->u[1] ? ~0 : 0;
3358   dst->u[2] = src0->u[2] == src1->u[2] ? ~0 : 0;
3359   dst->u[3] = src0->u[3] == src1->u[3] ? ~0 : 0;
3360}
3361
3362static void
3363micro_usge(union tgsi_exec_channel *dst,
3364           const union tgsi_exec_channel *src0,
3365           const union tgsi_exec_channel *src1)
3366{
3367   dst->u[0] = src0->u[0] >= src1->u[0] ? ~0 : 0;
3368   dst->u[1] = src0->u[1] >= src1->u[1] ? ~0 : 0;
3369   dst->u[2] = src0->u[2] >= src1->u[2] ? ~0 : 0;
3370   dst->u[3] = src0->u[3] >= src1->u[3] ? ~0 : 0;
3371}
3372
3373static void
3374micro_ushr(union tgsi_exec_channel *dst,
3375           const union tgsi_exec_channel *src0,
3376           const union tgsi_exec_channel *src1)
3377{
3378   dst->u[0] = src0->u[0] >> src1->u[0];
3379   dst->u[1] = src0->u[1] >> src1->u[1];
3380   dst->u[2] = src0->u[2] >> src1->u[2];
3381   dst->u[3] = src0->u[3] >> src1->u[3];
3382}
3383
3384static void
3385micro_uslt(union tgsi_exec_channel *dst,
3386           const union tgsi_exec_channel *src0,
3387           const union tgsi_exec_channel *src1)
3388{
3389   dst->u[0] = src0->u[0] < src1->u[0] ? ~0 : 0;
3390   dst->u[1] = src0->u[1] < src1->u[1] ? ~0 : 0;
3391   dst->u[2] = src0->u[2] < src1->u[2] ? ~0 : 0;
3392   dst->u[3] = src0->u[3] < src1->u[3] ? ~0 : 0;
3393}
3394
3395static void
3396micro_usne(union tgsi_exec_channel *dst,
3397           const union tgsi_exec_channel *src0,
3398           const union tgsi_exec_channel *src1)
3399{
3400   dst->u[0] = src0->u[0] != src1->u[0] ? ~0 : 0;
3401   dst->u[1] = src0->u[1] != src1->u[1] ? ~0 : 0;
3402   dst->u[2] = src0->u[2] != src1->u[2] ? ~0 : 0;
3403   dst->u[3] = src0->u[3] != src1->u[3] ? ~0 : 0;
3404}
3405
3406static void
3407micro_uarl(union tgsi_exec_channel *dst,
3408           const union tgsi_exec_channel *src)
3409{
3410   dst->i[0] = src->u[0];
3411   dst->i[1] = src->u[1];
3412   dst->i[2] = src->u[2];
3413   dst->i[3] = src->u[3];
3414}
3415
3416static void
3417micro_ucmp(union tgsi_exec_channel *dst,
3418           const union tgsi_exec_channel *src0,
3419           const union tgsi_exec_channel *src1,
3420           const union tgsi_exec_channel *src2)
3421{
3422   dst->u[0] = src0->u[0] ? src1->u[0] : src2->u[0];
3423   dst->u[1] = src0->u[1] ? src1->u[1] : src2->u[1];
3424   dst->u[2] = src0->u[2] ? src1->u[2] : src2->u[2];
3425   dst->u[3] = src0->u[3] ? src1->u[3] : src2->u[3];
3426}
3427
3428static void
3429exec_instruction(
3430   struct tgsi_exec_machine *mach,
3431   const struct tgsi_full_instruction *inst,
3432   int *pc )
3433{
3434   union tgsi_exec_channel r[10];
3435
3436   (*pc)++;
3437
3438   switch (inst->Instruction.Opcode) {
3439   case TGSI_OPCODE_ARL:
3440      exec_vector_unary(mach, inst, micro_arl, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
3441      break;
3442
3443   case TGSI_OPCODE_MOV:
3444      exec_vector_unary(mach, inst, micro_mov, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
3445      break;
3446
3447   case TGSI_OPCODE_LIT:
3448      exec_lit(mach, inst);
3449      break;
3450
3451   case TGSI_OPCODE_RCP:
3452      exec_scalar_unary(mach, inst, micro_rcp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3453      break;
3454
3455   case TGSI_OPCODE_RSQ:
3456      exec_scalar_unary(mach, inst, micro_rsq, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3457      break;
3458
3459   case TGSI_OPCODE_EXP:
3460      exec_exp(mach, inst);
3461      break;
3462
3463   case TGSI_OPCODE_LOG:
3464      exec_log(mach, inst);
3465      break;
3466
3467   case TGSI_OPCODE_MUL:
3468      exec_vector_binary(mach, inst, micro_mul, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3469      break;
3470
3471   case TGSI_OPCODE_ADD:
3472      exec_vector_binary(mach, inst, micro_add, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3473      break;
3474
3475   case TGSI_OPCODE_DP3:
3476      exec_dp3(mach, inst);
3477      break;
3478
3479   case TGSI_OPCODE_DP4:
3480      exec_dp4(mach, inst);
3481      break;
3482
3483   case TGSI_OPCODE_DST:
3484      exec_dst(mach, inst);
3485      break;
3486
3487   case TGSI_OPCODE_MIN:
3488      exec_vector_binary(mach, inst, micro_min, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3489      break;
3490
3491   case TGSI_OPCODE_MAX:
3492      exec_vector_binary(mach, inst, micro_max, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3493      break;
3494
3495   case TGSI_OPCODE_SLT:
3496      exec_vector_binary(mach, inst, micro_slt, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3497      break;
3498
3499   case TGSI_OPCODE_SGE:
3500      exec_vector_binary(mach, inst, micro_sge, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3501      break;
3502
3503   case TGSI_OPCODE_MAD:
3504      exec_vector_trinary(mach, inst, micro_mad, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3505      break;
3506
3507   case TGSI_OPCODE_SUB:
3508      exec_vector_binary(mach, inst, micro_sub, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3509      break;
3510
3511   case TGSI_OPCODE_LRP:
3512      exec_vector_trinary(mach, inst, micro_lrp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3513      break;
3514
3515   case TGSI_OPCODE_CND:
3516      exec_vector_trinary(mach, inst, micro_cnd, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3517      break;
3518
3519   case TGSI_OPCODE_DP2A:
3520      exec_dp2a(mach, inst);
3521      break;
3522
3523   case TGSI_OPCODE_FRC:
3524      exec_vector_unary(mach, inst, micro_frc, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3525      break;
3526
3527   case TGSI_OPCODE_CLAMP:
3528      exec_vector_trinary(mach, inst, micro_clamp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3529      break;
3530
3531   case TGSI_OPCODE_FLR:
3532      exec_vector_unary(mach, inst, micro_flr, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3533      break;
3534
3535   case TGSI_OPCODE_ROUND:
3536      exec_vector_unary(mach, inst, micro_rnd, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3537      break;
3538
3539   case TGSI_OPCODE_EX2:
3540      exec_scalar_unary(mach, inst, micro_exp2, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3541      break;
3542
3543   case TGSI_OPCODE_LG2:
3544      exec_scalar_unary(mach, inst, micro_lg2, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3545      break;
3546
3547   case TGSI_OPCODE_POW:
3548      exec_scalar_binary(mach, inst, micro_pow, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3549      break;
3550
3551   case TGSI_OPCODE_XPD:
3552      exec_xpd(mach, inst);
3553      break;
3554
3555   case TGSI_OPCODE_ABS:
3556      exec_vector_unary(mach, inst, micro_abs, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3557      break;
3558
3559   case TGSI_OPCODE_RCC:
3560      exec_scalar_unary(mach, inst, micro_rcc, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3561      break;
3562
3563   case TGSI_OPCODE_DPH:
3564      exec_dph(mach, inst);
3565      break;
3566
3567   case TGSI_OPCODE_COS:
3568      exec_scalar_unary(mach, inst, micro_cos, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3569      break;
3570
3571   case TGSI_OPCODE_DDX:
3572      exec_vector_unary(mach, inst, micro_ddx, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3573      break;
3574
3575   case TGSI_OPCODE_DDY:
3576      exec_vector_unary(mach, inst, micro_ddy, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3577      break;
3578
3579   case TGSI_OPCODE_KILP:
3580      exec_kilp (mach, inst);
3581      break;
3582
3583   case TGSI_OPCODE_KIL:
3584      exec_kil (mach, inst);
3585      break;
3586
3587   case TGSI_OPCODE_PK2H:
3588      assert (0);
3589      break;
3590
3591   case TGSI_OPCODE_PK2US:
3592      assert (0);
3593      break;
3594
3595   case TGSI_OPCODE_PK4B:
3596      assert (0);
3597      break;
3598
3599   case TGSI_OPCODE_PK4UB:
3600      assert (0);
3601      break;
3602
3603   case TGSI_OPCODE_RFL:
3604      exec_rfl(mach, inst);
3605      break;
3606
3607   case TGSI_OPCODE_SEQ:
3608      exec_vector_binary(mach, inst, micro_seq, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3609      break;
3610
3611   case TGSI_OPCODE_SFL:
3612      exec_vector(mach, inst, micro_sfl, TGSI_EXEC_DATA_FLOAT);
3613      break;
3614
3615   case TGSI_OPCODE_SGT:
3616      exec_vector_binary(mach, inst, micro_sgt, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3617      break;
3618
3619   case TGSI_OPCODE_SIN:
3620      exec_scalar_unary(mach, inst, micro_sin, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3621      break;
3622
3623   case TGSI_OPCODE_SLE:
3624      exec_vector_binary(mach, inst, micro_sle, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3625      break;
3626
3627   case TGSI_OPCODE_SNE:
3628      exec_vector_binary(mach, inst, micro_sne, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3629      break;
3630
3631   case TGSI_OPCODE_STR:
3632      exec_vector(mach, inst, micro_str, TGSI_EXEC_DATA_FLOAT);
3633      break;
3634
3635   case TGSI_OPCODE_TEX:
3636      /* simple texture lookup */
3637      /* src[0] = texcoord */
3638      /* src[1] = sampler unit */
3639      exec_tex(mach, inst, TEX_MODIFIER_NONE);
3640      break;
3641
3642   case TGSI_OPCODE_TXB:
3643      /* Texture lookup with lod bias */
3644      /* src[0] = texcoord (src[0].w = LOD bias) */
3645      /* src[1] = sampler unit */
3646      exec_tex(mach, inst, TEX_MODIFIER_LOD_BIAS);
3647      break;
3648
3649   case TGSI_OPCODE_TXD:
      /* Texture lookup with explicit partial derivatives */
3651      /* src[0] = texcoord */
3652      /* src[1] = d[strq]/dx */
3653      /* src[2] = d[strq]/dy */
3654      /* src[3] = sampler unit */
3655      exec_txd(mach, inst);
3656      break;
3657
3658   case TGSI_OPCODE_TXL:
      /* Texture lookup with explicit LOD */
3660      /* src[0] = texcoord (src[0].w = LOD) */
3661      /* src[1] = sampler unit */
3662      exec_tex(mach, inst, TEX_MODIFIER_EXPLICIT_LOD);
3663      break;
3664
3665   case TGSI_OPCODE_TXP:
3666      /* Texture lookup with projection */
3667      /* src[0] = texcoord (src[0].w = projection) */
3668      /* src[1] = sampler unit */
3669      exec_tex(mach, inst, TEX_MODIFIER_PROJECTED);
3670      break;
3671
3672   case TGSI_OPCODE_UP2H:
3673      assert (0);
3674      break;
3675
3676   case TGSI_OPCODE_UP2US:
3677      assert (0);
3678      break;
3679
3680   case TGSI_OPCODE_UP4B:
3681      assert (0);
3682      break;
3683
3684   case TGSI_OPCODE_UP4UB:
3685      assert (0);
3686      break;
3687
3688   case TGSI_OPCODE_X2D:
3689      exec_x2d(mach, inst);
3690      break;
3691
3692   case TGSI_OPCODE_ARA:
3693      assert (0);
3694      break;
3695
3696   case TGSI_OPCODE_ARR:
3697      exec_vector_unary(mach, inst, micro_arr, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
3698      break;
3699
3700   case TGSI_OPCODE_BRA:
3701      assert (0);
3702      break;
3703
3704   case TGSI_OPCODE_CAL:
3705      /* skip the call if no execution channels are enabled */
3706      if (mach->ExecMask) {
3707         /* do the call */
3708
3709         /* First, record the depths of the execution stacks.
3710          * This is important for deeply nested/looped return statements.
3711          * We have to unwind the stacks by the correct amount.  For a
3712          * real code generator, we could determine the number of entries
3713          * to pop off each stack with simple static analysis and avoid
3714          * implementing this data structure at run time.
3715          */
3716         mach->CallStack[mach->CallStackTop].CondStackTop = mach->CondStackTop;
3717         mach->CallStack[mach->CallStackTop].LoopStackTop = mach->LoopStackTop;
3718         mach->CallStack[mach->CallStackTop].ContStackTop = mach->ContStackTop;
3719         mach->CallStack[mach->CallStackTop].SwitchStackTop = mach->SwitchStackTop;
3720         mach->CallStack[mach->CallStackTop].BreakStackTop = mach->BreakStackTop;
3721         /* note that PC was already incremented above */
3722         mach->CallStack[mach->CallStackTop].ReturnAddr = *pc;
3723
3724         mach->CallStackTop++;
3725
3726         /* Second, push the Cond, Loop, Cont, Func stacks */
3727         assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
3728         assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3729         assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3730         assert(mach->SwitchStackTop < TGSI_EXEC_MAX_SWITCH_NESTING);
3731         assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
3732         assert(mach->FuncStackTop < TGSI_EXEC_MAX_CALL_NESTING);
3733
3734         mach->CondStack[mach->CondStackTop++] = mach->CondMask;
3735         mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
3736         mach->ContStack[mach->ContStackTop++] = mach->ContMask;
3737         mach->SwitchStack[mach->SwitchStackTop++] = mach->Switch;
3738         mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
3739         mach->FuncStack[mach->FuncStackTop++] = mach->FuncMask;
3740
3741         /* Finally, jump to the subroutine */
3742         *pc = inst->Label.Label;
3743      }
3744      break;
3745
3746   case TGSI_OPCODE_RET:
3747      mach->FuncMask &= ~mach->ExecMask;
3748      UPDATE_EXEC_MASK(mach);
3749
3750      if (mach->FuncMask == 0x0) {
         /* really return now (otherwise, keep executing) */
3752
3753         if (mach->CallStackTop == 0) {
3754            /* returning from main() */
3755            mach->CondStackTop = 0;
3756            mach->LoopStackTop = 0;
3757            *pc = -1;
3758            return;
3759         }
3760
3761         assert(mach->CallStackTop > 0);
3762         mach->CallStackTop--;
3763
3764         mach->CondStackTop = mach->CallStack[mach->CallStackTop].CondStackTop;
3765         mach->CondMask = mach->CondStack[mach->CondStackTop];
3766
3767         mach->LoopStackTop = mach->CallStack[mach->CallStackTop].LoopStackTop;
3768         mach->LoopMask = mach->LoopStack[mach->LoopStackTop];
3769
3770         mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
3771         mach->ContMask = mach->ContStack[mach->ContStackTop];
3772
3773         mach->SwitchStackTop = mach->CallStack[mach->CallStackTop].SwitchStackTop;
3774         mach->Switch = mach->SwitchStack[mach->SwitchStackTop];
3775
3776         mach->BreakStackTop = mach->CallStack[mach->CallStackTop].BreakStackTop;
3777         mach->BreakType = mach->BreakStack[mach->BreakStackTop];
3778
3779         assert(mach->FuncStackTop > 0);
3780         mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
3781
3782         *pc = mach->CallStack[mach->CallStackTop].ReturnAddr;
3783
3784         UPDATE_EXEC_MASK(mach);
3785      }
3786      break;
3787
3788   case TGSI_OPCODE_SSG:
3789      exec_vector_unary(mach, inst, micro_sgn, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3790      break;
3791
3792   case TGSI_OPCODE_CMP:
3793      exec_vector_trinary(mach, inst, micro_cmp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3794      break;
3795
3796   case TGSI_OPCODE_SCS:
3797      exec_scs(mach, inst);
3798      break;
3799
3800   case TGSI_OPCODE_NRM:
3801      exec_nrm3(mach, inst);
3802      break;
3803
3804   case TGSI_OPCODE_NRM4:
3805      exec_nrm4(mach, inst);
3806      break;
3807
3808   case TGSI_OPCODE_DIV:
3809      exec_vector_binary(mach, inst, micro_div, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3810      break;
3811
3812   case TGSI_OPCODE_DP2:
3813      exec_dp2(mach, inst);
3814      break;
3815
3816   case TGSI_OPCODE_IF:
3817      /* push CondMask */
3818      assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
3819      mach->CondStack[mach->CondStackTop++] = mach->CondMask;
3820      FETCH( &r[0], 0, CHAN_X );
3821      /* update CondMask */
3822      if( ! r[0].u[0] ) {
3823         mach->CondMask &= ~0x1;
3824      }
3825      if( ! r[0].u[1] ) {
3826         mach->CondMask &= ~0x2;
3827      }
3828      if( ! r[0].u[2] ) {
3829         mach->CondMask &= ~0x4;
3830      }
3831      if( ! r[0].u[3] ) {
3832         mach->CondMask &= ~0x8;
3833      }
3834      UPDATE_EXEC_MASK(mach);
3835      /* Todo: If CondMask==0, jump to ELSE */
3836      break;
3837
3838   case TGSI_OPCODE_ELSE:
3839      /* invert CondMask wrt previous mask */
3840      {
3841         uint prevMask;
3842         assert(mach->CondStackTop > 0);
3843         prevMask = mach->CondStack[mach->CondStackTop - 1];
3844         mach->CondMask = ~mach->CondMask & prevMask;
3845         UPDATE_EXEC_MASK(mach);
3846         /* Todo: If CondMask==0, jump to ENDIF */
3847      }
3848      break;
3849
3850   case TGSI_OPCODE_ENDIF:
3851      /* pop CondMask */
3852      assert(mach->CondStackTop > 0);
3853      mach->CondMask = mach->CondStack[--mach->CondStackTop];
3854      UPDATE_EXEC_MASK(mach);
3855      break;
3856
3857   case TGSI_OPCODE_END:
3858      /* make sure we end primitives which haven't
3859       * been explicitly emitted */
3860      conditional_emit_primitive(mach);
3861      /* halt execution */
3862      *pc = -1;
3863      break;
3864
3865   case TGSI_OPCODE_PUSHA:
3866      assert (0);
3867      break;
3868
3869   case TGSI_OPCODE_POPA:
3870      assert (0);
3871      break;
3872
3873   case TGSI_OPCODE_CEIL:
3874      exec_vector_unary(mach, inst, micro_ceil, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3875      break;
3876
3877   case TGSI_OPCODE_I2F:
3878      exec_vector_unary(mach, inst, micro_i2f, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_INT);
3879      break;
3880
3881   case TGSI_OPCODE_NOT:
3882      exec_vector_unary(mach, inst, micro_not, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3883      break;
3884
3885   case TGSI_OPCODE_TRUNC:
3886      exec_vector_unary(mach, inst, micro_trunc, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3887      break;
3888
3889   case TGSI_OPCODE_SHL:
3890      exec_vector_binary(mach, inst, micro_shl, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3891      break;
3892
3893   case TGSI_OPCODE_AND:
3894      exec_vector_binary(mach, inst, micro_and, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3895      break;
3896
3897   case TGSI_OPCODE_OR:
3898      exec_vector_binary(mach, inst, micro_or, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3899      break;
3900
3901   case TGSI_OPCODE_MOD:
3902      exec_vector_binary(mach, inst, micro_mod, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3903      break;
3904
3905   case TGSI_OPCODE_XOR:
3906      exec_vector_binary(mach, inst, micro_xor, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3907      break;
3908
3909   case TGSI_OPCODE_SAD:
3910      assert (0);
3911      break;
3912
3913   case TGSI_OPCODE_TXF:
3914      exec_txf(mach, inst);
3915      break;
3916
3917   case TGSI_OPCODE_TXQ:
3918      exec_txq(mach, inst);
3919      break;
3920
3921   case TGSI_OPCODE_EMIT:
3922      emit_vertex(mach);
3923      break;
3924
3925   case TGSI_OPCODE_ENDPRIM:
3926      emit_primitive(mach);
3927      break;
3928
3929   case TGSI_OPCODE_BGNLOOP:
3930      /* push LoopMask and ContMasks */
3931      assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3932      assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3933      assert(mach->LoopLabelStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3934      assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
3935
3936      mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
3937      mach->ContStack[mach->ContStackTop++] = mach->ContMask;
3938      mach->LoopLabelStack[mach->LoopLabelStackTop++] = *pc - 1;
3939      mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
3940      mach->BreakType = TGSI_EXEC_BREAK_INSIDE_LOOP;
3941      break;
3942
3943   case TGSI_OPCODE_ENDLOOP:
3944      /* Restore ContMask, but don't pop */
3945      assert(mach->ContStackTop > 0);
3946      mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
3947      UPDATE_EXEC_MASK(mach);
3948      if (mach->ExecMask) {
3949         /* repeat loop: jump to instruction just past BGNLOOP */
3950         assert(mach->LoopLabelStackTop > 0);
3951         *pc = mach->LoopLabelStack[mach->LoopLabelStackTop - 1] + 1;
3952      }
3953      else {
3954         /* exit loop: pop LoopMask */
3955         assert(mach->LoopStackTop > 0);
3956         mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
3957         /* pop ContMask */
3958         assert(mach->ContStackTop > 0);
3959         mach->ContMask = mach->ContStack[--mach->ContStackTop];
3960         assert(mach->LoopLabelStackTop > 0);
3961         --mach->LoopLabelStackTop;
3962
3963         mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
3964      }
3965      UPDATE_EXEC_MASK(mach);
3966      break;
3967
3968   case TGSI_OPCODE_BRK:
3969      exec_break(mach);
3970      break;
3971
3972   case TGSI_OPCODE_CONT:
3973      /* turn off cont channels for each enabled exec channel */
3974      mach->ContMask &= ~mach->ExecMask;
3975      /* Todo: if mach->LoopMask == 0, jump to end of loop */
3976      UPDATE_EXEC_MASK(mach);
3977      break;
3978
3979   case TGSI_OPCODE_BGNSUB:
3980      /* no-op */
3981      break;
3982
3983   case TGSI_OPCODE_ENDSUB:
3984      /*
3985       * XXX: This really should be a no-op. We should never reach this opcode.
3986       */
3987
3988      assert(mach->CallStackTop > 0);
3989      mach->CallStackTop--;
3990
3991      mach->CondStackTop = mach->CallStack[mach->CallStackTop].CondStackTop;
3992      mach->CondMask = mach->CondStack[mach->CondStackTop];
3993
3994      mach->LoopStackTop = mach->CallStack[mach->CallStackTop].LoopStackTop;
3995      mach->LoopMask = mach->LoopStack[mach->LoopStackTop];
3996
3997      mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
3998      mach->ContMask = mach->ContStack[mach->ContStackTop];
3999
4000      mach->SwitchStackTop = mach->CallStack[mach->CallStackTop].SwitchStackTop;
4001      mach->Switch = mach->SwitchStack[mach->SwitchStackTop];
4002
4003      mach->BreakStackTop = mach->CallStack[mach->CallStackTop].BreakStackTop;
4004      mach->BreakType = mach->BreakStack[mach->BreakStackTop];
4005
4006      assert(mach->FuncStackTop > 0);
4007      mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
4008
4009      *pc = mach->CallStack[mach->CallStackTop].ReturnAddr;
4010
4011      UPDATE_EXEC_MASK(mach);
4012      break;
4013
4014   case TGSI_OPCODE_NOP:
4015      break;
4016
4017   case TGSI_OPCODE_BREAKC:
4018      FETCH(&r[0], 0, CHAN_X);
      /* update LoopMask: disable enabled channels whose condition is true */
4020      if (r[0].u[0] && (mach->ExecMask & 0x1)) {
4021         mach->LoopMask &= ~0x1;
4022      }
4023      if (r[0].u[1] && (mach->ExecMask & 0x2)) {
4024         mach->LoopMask &= ~0x2;
4025      }
4026      if (r[0].u[2] && (mach->ExecMask & 0x4)) {
4027         mach->LoopMask &= ~0x4;
4028      }
4029      if (r[0].u[3] && (mach->ExecMask & 0x8)) {
4030         mach->LoopMask &= ~0x8;
4031      }
4032      /* Todo: if mach->LoopMask == 0, jump to end of loop */
4033      UPDATE_EXEC_MASK(mach);
4034      break;
4035
4036   case TGSI_OPCODE_F2I:
4037      exec_vector_unary(mach, inst, micro_f2i, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
4038      break;
4039
4040   case TGSI_OPCODE_IDIV:
4041      exec_vector_binary(mach, inst, micro_idiv, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
4042      break;
4043
4044   case TGSI_OPCODE_IMAX:
4045      exec_vector_binary(mach, inst, micro_imax, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
4046      break;
4047
4048   case TGSI_OPCODE_IMIN:
4049      exec_vector_binary(mach, inst, micro_imin, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
4050      break;
4051
4052   case TGSI_OPCODE_INEG:
4053      exec_vector_unary(mach, inst, micro_ineg, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
4054      break;
4055
4056   case TGSI_OPCODE_ISGE:
4057      exec_vector_binary(mach, inst, micro_isge, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
4058      break;
4059
4060   case TGSI_OPCODE_ISHR:
4061      exec_vector_binary(mach, inst, micro_ishr, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
4062      break;
4063
4064   case TGSI_OPCODE_ISLT:
4065      exec_vector_binary(mach, inst, micro_islt, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
4066      break;
4067
4068   case TGSI_OPCODE_F2U:
4069      exec_vector_unary(mach, inst, micro_f2u, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
4070      break;
4071
4072   case TGSI_OPCODE_U2F:
4073      exec_vector_unary(mach, inst, micro_u2f, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_UINT);
4074      break;
4075
4076   case TGSI_OPCODE_UADD:
4077      exec_vector_binary(mach, inst, micro_uadd, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4078      break;
4079
4080   case TGSI_OPCODE_UDIV:
4081      exec_vector_binary(mach, inst, micro_udiv, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4082      break;
4083
4084   case TGSI_OPCODE_UMAD:
4085      exec_vector_trinary(mach, inst, micro_umad, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4086      break;
4087
4088   case TGSI_OPCODE_UMAX:
4089      exec_vector_binary(mach, inst, micro_umax, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4090      break;
4091
4092   case TGSI_OPCODE_UMIN:
4093      exec_vector_binary(mach, inst, micro_umin, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4094      break;
4095
4096   case TGSI_OPCODE_UMOD:
4097      exec_vector_binary(mach, inst, micro_umod, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4098      break;
4099
4100   case TGSI_OPCODE_UMUL:
4101      exec_vector_binary(mach, inst, micro_umul, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4102      break;
4103
4104   case TGSI_OPCODE_USEQ:
4105      exec_vector_binary(mach, inst, micro_useq, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4106      break;
4107
4108   case TGSI_OPCODE_USGE:
4109      exec_vector_binary(mach, inst, micro_usge, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4110      break;
4111
4112   case TGSI_OPCODE_USHR:
4113      exec_vector_binary(mach, inst, micro_ushr, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4114      break;
4115
4116   case TGSI_OPCODE_USLT:
4117      exec_vector_binary(mach, inst, micro_uslt, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4118      break;
4119
4120   case TGSI_OPCODE_USNE:
4121      exec_vector_binary(mach, inst, micro_usne, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4122      break;
4123
4124   case TGSI_OPCODE_SWITCH:
4125      exec_switch(mach, inst);
4126      break;
4127
4128   case TGSI_OPCODE_CASE:
4129      exec_case(mach, inst);
4130      break;
4131
4132   case TGSI_OPCODE_DEFAULT:
4133      exec_default(mach);
4134      break;
4135
4136   case TGSI_OPCODE_ENDSWITCH:
4137      exec_endswitch(mach);
4138      break;
4139
4140   case TGSI_OPCODE_LOAD:
4141      assert(0);
4142      break;
4143
4144   case TGSI_OPCODE_LOAD_MS:
4145      assert(0);
4146      break;
4147
4148   case TGSI_OPCODE_SAMPLE:
4149      exec_sample(mach, inst, TEX_MODIFIER_NONE);
4150      break;
4151
4152   case TGSI_OPCODE_SAMPLE_B:
4153      exec_sample(mach, inst, TEX_MODIFIER_LOD_BIAS);
4154      break;
4155
4156   case TGSI_OPCODE_SAMPLE_C:
4157      exec_sample(mach, inst, TEX_MODIFIER_NONE);
4158      break;
4159
4160   case TGSI_OPCODE_SAMPLE_C_LZ:
4161      exec_sample(mach, inst, TEX_MODIFIER_LOD_BIAS);
4162      break;
4163
4164   case TGSI_OPCODE_SAMPLE_D:
4165      exec_sample_d(mach, inst);
4166      break;
4167
4168   case TGSI_OPCODE_SAMPLE_L:
4169      exec_sample(mach, inst, TEX_MODIFIER_EXPLICIT_LOD);
4170      break;
4171
4172   case TGSI_OPCODE_GATHER4:
4173      assert(0);
4174      break;
4175
4176   case TGSI_OPCODE_RESINFO:
4177      assert(0);
4178      break;
4179
4180   case TGSI_OPCODE_SAMPLE_POS:
4181      assert(0);
4182      break;
4183
4184   case TGSI_OPCODE_SAMPLE_INFO:
4185      assert(0);
4186      break;
4187
4188   case TGSI_OPCODE_UARL:
4189      exec_vector_unary(mach, inst, micro_uarl, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_UINT);
4190      break;
4191
4192   case TGSI_OPCODE_UCMP:
4193      exec_vector_trinary(mach, inst, micro_ucmp, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4194      break;
4195
4196   case TGSI_OPCODE_IABS:
4197      exec_vector_unary(mach, inst, micro_iabs, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
4198      break;
4199
4200   default:
4201      assert( 0 );
4202   }
4203}
4204
4205
4206#define DEBUG_EXECUTION 0
4207
4208
4209/**
4210 * Run TGSI interpreter.
4211 * \return bitmask of "alive" quad components
4212 */
4213uint
4214tgsi_exec_machine_run( struct tgsi_exec_machine *mach )
4215{
4216   uint i;
4217   int pc = 0;
4218
4219   mach->CondMask = 0xf;
4220   mach->LoopMask = 0xf;
4221   mach->ContMask = 0xf;
4222   mach->FuncMask = 0xf;
4223   mach->ExecMask = 0xf;
4224
4225   mach->Switch.mask = 0xf;
4226
4227   assert(mach->CondStackTop == 0);
4228   assert(mach->LoopStackTop == 0);
4229   assert(mach->ContStackTop == 0);
4230   assert(mach->SwitchStackTop == 0);
4231   assert(mach->BreakStackTop == 0);
4232   assert(mach->CallStackTop == 0);
4233
4234   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] = 0;
4235   mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] = 0;
4236
4237   if( mach->Processor == TGSI_PROCESSOR_GEOMETRY ) {
4238      mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0] = 0;
4239      mach->Primitives[0] = 0;
4240   }
4241
4242   /* execute declarations (interpolants) */
4243   for (i = 0; i < mach->NumDeclarations; i++) {
4244      exec_declaration( mach, mach->Declarations+i );
4245   }
4246
4247   {
4248#if DEBUG_EXECUTION
4249      struct tgsi_exec_vector temps[TGSI_EXEC_NUM_TEMPS + TGSI_EXEC_NUM_TEMP_EXTRAS];
4250      struct tgsi_exec_vector outputs[PIPE_MAX_ATTRIBS];
4251      uint inst = 1;
4252
4253      memcpy(temps, mach->Temps, sizeof(temps));
4254      memcpy(outputs, mach->Outputs, sizeof(outputs));
4255#endif
4256
4257      /* execute instructions, until pc is set to -1 */
4258      while (pc != -1) {
4259
4260#if DEBUG_EXECUTION
4261         uint i;
4262
4263         tgsi_dump_instruction(&mach->Instructions[pc], inst++);
4264#endif
4265
4266         assert(pc < (int) mach->NumInstructions);
4267         exec_instruction(mach, mach->Instructions + pc, &pc);
4268
4269#if DEBUG_EXECUTION
4270         for (i = 0; i < TGSI_EXEC_NUM_TEMPS + TGSI_EXEC_NUM_TEMP_EXTRAS; i++) {
4271            if (memcmp(&temps[i], &mach->Temps[i], sizeof(temps[i]))) {
4272               uint j;
4273
4274               memcpy(&temps[i], &mach->Temps[i], sizeof(temps[i]));
4275               debug_printf("TEMP[%2u] = ", i);
4276               for (j = 0; j < 4; j++) {
4277                  if (j > 0) {
4278                     debug_printf("           ");
4279                  }
4280                  debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
4281                               temps[i].xyzw[0].f[j], temps[i].xyzw[0].u[j],
4282                               temps[i].xyzw[1].f[j], temps[i].xyzw[1].u[j],
4283                               temps[i].xyzw[2].f[j], temps[i].xyzw[2].u[j],
4284                               temps[i].xyzw[3].f[j], temps[i].xyzw[3].u[j]);
4285               }
4286            }
4287         }
4288         for (i = 0; i < PIPE_MAX_ATTRIBS; i++) {
4289            if (memcmp(&outputs[i], &mach->Outputs[i], sizeof(outputs[i]))) {
4290               uint j;
4291
4292               memcpy(&outputs[i], &mach->Outputs[i], sizeof(outputs[i]));
4293               debug_printf("OUT[%2u] =  ", i);
4294               for (j = 0; j < 4; j++) {
4295                  if (j > 0) {
4296                     debug_printf("           ");
4297                  }
4298                  debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
4299                               outputs[i].xyzw[0].f[j], outputs[i].xyzw[0].u[j],
4300                               outputs[i].xyzw[1].f[j], outputs[i].xyzw[1].u[j],
4301                               outputs[i].xyzw[2].f[j], outputs[i].xyzw[2].u[j],
4302                               outputs[i].xyzw[3].f[j], outputs[i].xyzw[3].u[j]);
4303               }
4304            }
4305         }
4306#endif
4307      }
4308   }
4309
4310#if 0
4311   /* we scale from floats in [0,1] to Zbuffer ints in sp_quad_depth_test.c */
4312   if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) {
4313      /*
4314       * Scale back depth component.
4315       */
4316      for (i = 0; i < 4; i++)
4317         mach->Outputs[0].xyzw[2].f[i] *= ctx->DrawBuffer->_DepthMaxF;
4318   }
4319#endif
4320
4321   /* Strictly speaking, these assertions aren't really needed but they
4322    * can potentially catch some bugs in the control flow code.
4323    */
4324   assert(mach->CondStackTop == 0);
4325   assert(mach->LoopStackTop == 0);
4326   assert(mach->ContStackTop == 0);
4327   assert(mach->SwitchStackTop == 0);
4328   assert(mach->BreakStackTop == 0);
4329   assert(mach->CallStackTop == 0);
4330
4331   return ~mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
4332}
4333