tgsi_exec.c revision 34a78b7ef6b0edf217acf221eab4b63542be5552
1/**************************************************************************
2 *
3 * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
4 * All Rights Reserved.
5 * Copyright 2009-2010 VMware, Inc.  All rights Reserved.
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the
9 * "Software"), to deal in the Software without restriction, including
10 * without limitation the rights to use, copy, modify, merge, publish,
11 * distribute, sub license, and/or sell copies of the Software, and to
12 * permit persons to whom the Software is furnished to do so, subject to
13 * the following conditions:
14 *
15 * The above copyright notice and this permission notice (including the
16 * next paragraph) shall be included in all copies or substantial portions
17 * of the Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
22 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
23 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
24 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
25 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 *
27 **************************************************************************/
28
29/**
30 * TGSI interpreter/executor.
31 *
32 * Flow control information:
33 *
34 * Since we operate on 'quads' (4 pixels or 4 vertices in parallel)
35 * flow control statements (IF/ELSE/ENDIF, LOOP/ENDLOOP) require special
36 * care since a condition may be true for some quad components but false
37 * for other components.
38 *
39 * We basically execute all statements (even if they're in the part of
40 * an IF/ELSE clause that's "not taken") and use a special mask to
41 * control writing to destination registers.  This is the ExecMask.
42 * See store_dest().
43 *
 * The ExecMask is computed from several other masks (CondMask, LoopMask,
 * ContMask, Switch.mask and FuncMask; see UPDATE_EXEC_MASK) which are
 * controlled by the flow control instructions (for example
 * IF/ELSE/ENDIF, LOOP/ENDLOOP and CONT).
47 *
48 *
49 * Authors:
50 *   Michal Krol
51 *   Brian Paul
52 */
53
54#include "pipe/p_compiler.h"
55#include "pipe/p_state.h"
56#include "pipe/p_shader_tokens.h"
57#include "tgsi/tgsi_dump.h"
58#include "tgsi/tgsi_parse.h"
59#include "tgsi/tgsi_util.h"
60#include "tgsi_exec.h"
61#include "util/u_memory.h"
62#include "util/u_math.h"
63
64
/*
 * When nonzero, micro_exp2(), micro_lg2() and micro_pow() use the
 * util_fast_*() helpers instead of the C math library.
 */
#define FAST_MATH 0

/*
 * Lane indices within a channel's four-element arrays, named by position
 * in the quad.  micro_ddx() and micro_ddy() use them to pick the lanes
 * they subtract.
 */
#define TILE_TOP_LEFT     0
#define TILE_TOP_RIGHT    1
#define TILE_BOTTOM_LEFT  2
#define TILE_BOTTOM_RIGHT 3
71
72static void
73micro_abs(union tgsi_exec_channel *dst,
74          const union tgsi_exec_channel *src)
75{
76   dst->f[0] = fabsf(src->f[0]);
77   dst->f[1] = fabsf(src->f[1]);
78   dst->f[2] = fabsf(src->f[2]);
79   dst->f[3] = fabsf(src->f[3]);
80}
81
82static void
83micro_arl(union tgsi_exec_channel *dst,
84          const union tgsi_exec_channel *src)
85{
86   dst->i[0] = (int)floorf(src->f[0]);
87   dst->i[1] = (int)floorf(src->f[1]);
88   dst->i[2] = (int)floorf(src->f[2]);
89   dst->i[3] = (int)floorf(src->f[3]);
90}
91
92static void
93micro_arr(union tgsi_exec_channel *dst,
94          const union tgsi_exec_channel *src)
95{
96   dst->i[0] = (int)floorf(src->f[0] + 0.5f);
97   dst->i[1] = (int)floorf(src->f[1] + 0.5f);
98   dst->i[2] = (int)floorf(src->f[2] + 0.5f);
99   dst->i[3] = (int)floorf(src->f[3] + 0.5f);
100}
101
102static void
103micro_ceil(union tgsi_exec_channel *dst,
104           const union tgsi_exec_channel *src)
105{
106   dst->f[0] = ceilf(src->f[0]);
107   dst->f[1] = ceilf(src->f[1]);
108   dst->f[2] = ceilf(src->f[2]);
109   dst->f[3] = ceilf(src->f[3]);
110}
111
112static void
113micro_clamp(union tgsi_exec_channel *dst,
114            const union tgsi_exec_channel *src0,
115            const union tgsi_exec_channel *src1,
116            const union tgsi_exec_channel *src2)
117{
118   dst->f[0] = src0->f[0] < src1->f[0] ? src1->f[0] : src0->f[0] > src2->f[0] ? src2->f[0] : src0->f[0];
119   dst->f[1] = src0->f[1] < src1->f[1] ? src1->f[1] : src0->f[1] > src2->f[1] ? src2->f[1] : src0->f[1];
120   dst->f[2] = src0->f[2] < src1->f[2] ? src1->f[2] : src0->f[2] > src2->f[2] ? src2->f[2] : src0->f[2];
121   dst->f[3] = src0->f[3] < src1->f[3] ? src1->f[3] : src0->f[3] > src2->f[3] ? src2->f[3] : src0->f[3];
122}
123
124static void
125micro_cmp(union tgsi_exec_channel *dst,
126          const union tgsi_exec_channel *src0,
127          const union tgsi_exec_channel *src1,
128          const union tgsi_exec_channel *src2)
129{
130   dst->f[0] = src0->f[0] < 0.0f ? src1->f[0] : src2->f[0];
131   dst->f[1] = src0->f[1] < 0.0f ? src1->f[1] : src2->f[1];
132   dst->f[2] = src0->f[2] < 0.0f ? src1->f[2] : src2->f[2];
133   dst->f[3] = src0->f[3] < 0.0f ? src1->f[3] : src2->f[3];
134}
135
136static void
137micro_cnd(union tgsi_exec_channel *dst,
138          const union tgsi_exec_channel *src0,
139          const union tgsi_exec_channel *src1,
140          const union tgsi_exec_channel *src2)
141{
142   dst->f[0] = src2->f[0] > 0.5f ? src0->f[0] : src1->f[0];
143   dst->f[1] = src2->f[1] > 0.5f ? src0->f[1] : src1->f[1];
144   dst->f[2] = src2->f[2] > 0.5f ? src0->f[2] : src1->f[2];
145   dst->f[3] = src2->f[3] > 0.5f ? src0->f[3] : src1->f[3];
146}
147
148static void
149micro_cos(union tgsi_exec_channel *dst,
150          const union tgsi_exec_channel *src)
151{
152   dst->f[0] = cosf(src->f[0]);
153   dst->f[1] = cosf(src->f[1]);
154   dst->f[2] = cosf(src->f[2]);
155   dst->f[3] = cosf(src->f[3]);
156}
157
158static void
159micro_ddx(union tgsi_exec_channel *dst,
160          const union tgsi_exec_channel *src)
161{
162   dst->f[0] =
163   dst->f[1] =
164   dst->f[2] =
165   dst->f[3] = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_BOTTOM_LEFT];
166}
167
168static void
169micro_ddy(union tgsi_exec_channel *dst,
170          const union tgsi_exec_channel *src)
171{
172   dst->f[0] =
173   dst->f[1] =
174   dst->f[2] =
175   dst->f[3] = src->f[TILE_BOTTOM_LEFT] - src->f[TILE_TOP_LEFT];
176}
177
178static void
179micro_exp2(union tgsi_exec_channel *dst,
180           const union tgsi_exec_channel *src)
181{
182#if FAST_MATH
183   dst->f[0] = util_fast_exp2(src->f[0]);
184   dst->f[1] = util_fast_exp2(src->f[1]);
185   dst->f[2] = util_fast_exp2(src->f[2]);
186   dst->f[3] = util_fast_exp2(src->f[3]);
187#else
188#if DEBUG
189   /* Inf is okay for this instruction, so clamp it to silence assertions. */
190   uint i;
191   union tgsi_exec_channel clamped;
192
193   for (i = 0; i < 4; i++) {
194      if (src->f[i] > 127.99999f) {
195         clamped.f[i] = 127.99999f;
196      } else if (src->f[i] < -126.99999f) {
197         clamped.f[i] = -126.99999f;
198      } else {
199         clamped.f[i] = src->f[i];
200      }
201   }
202   src = &clamped;
203#endif /* DEBUG */
204
205   dst->f[0] = powf(2.0f, src->f[0]);
206   dst->f[1] = powf(2.0f, src->f[1]);
207   dst->f[2] = powf(2.0f, src->f[2]);
208   dst->f[3] = powf(2.0f, src->f[3]);
209#endif /* FAST_MATH */
210}
211
212static void
213micro_flr(union tgsi_exec_channel *dst,
214          const union tgsi_exec_channel *src)
215{
216   dst->f[0] = floorf(src->f[0]);
217   dst->f[1] = floorf(src->f[1]);
218   dst->f[2] = floorf(src->f[2]);
219   dst->f[3] = floorf(src->f[3]);
220}
221
222static void
223micro_frc(union tgsi_exec_channel *dst,
224          const union tgsi_exec_channel *src)
225{
226   dst->f[0] = src->f[0] - floorf(src->f[0]);
227   dst->f[1] = src->f[1] - floorf(src->f[1]);
228   dst->f[2] = src->f[2] - floorf(src->f[2]);
229   dst->f[3] = src->f[3] - floorf(src->f[3]);
230}
231
232static void
233micro_iabs(union tgsi_exec_channel *dst,
234           const union tgsi_exec_channel *src)
235{
236   dst->i[0] = src->i[0] >= 0 ? src->i[0] : -src->i[0];
237   dst->i[1] = src->i[1] >= 0 ? src->i[1] : -src->i[1];
238   dst->i[2] = src->i[2] >= 0 ? src->i[2] : -src->i[2];
239   dst->i[3] = src->i[3] >= 0 ? src->i[3] : -src->i[3];
240}
241
242static void
243micro_ineg(union tgsi_exec_channel *dst,
244           const union tgsi_exec_channel *src)
245{
246   dst->i[0] = -src->i[0];
247   dst->i[1] = -src->i[1];
248   dst->i[2] = -src->i[2];
249   dst->i[3] = -src->i[3];
250}
251
252static void
253micro_lg2(union tgsi_exec_channel *dst,
254          const union tgsi_exec_channel *src)
255{
256#if FAST_MATH
257   dst->f[0] = util_fast_log2(src->f[0]);
258   dst->f[1] = util_fast_log2(src->f[1]);
259   dst->f[2] = util_fast_log2(src->f[2]);
260   dst->f[3] = util_fast_log2(src->f[3]);
261#else
262   dst->f[0] = logf(src->f[0]) * 1.442695f;
263   dst->f[1] = logf(src->f[1]) * 1.442695f;
264   dst->f[2] = logf(src->f[2]) * 1.442695f;
265   dst->f[3] = logf(src->f[3]) * 1.442695f;
266#endif
267}
268
269static void
270micro_lrp(union tgsi_exec_channel *dst,
271          const union tgsi_exec_channel *src0,
272          const union tgsi_exec_channel *src1,
273          const union tgsi_exec_channel *src2)
274{
275   dst->f[0] = src0->f[0] * (src1->f[0] - src2->f[0]) + src2->f[0];
276   dst->f[1] = src0->f[1] * (src1->f[1] - src2->f[1]) + src2->f[1];
277   dst->f[2] = src0->f[2] * (src1->f[2] - src2->f[2]) + src2->f[2];
278   dst->f[3] = src0->f[3] * (src1->f[3] - src2->f[3]) + src2->f[3];
279}
280
281static void
282micro_mad(union tgsi_exec_channel *dst,
283          const union tgsi_exec_channel *src0,
284          const union tgsi_exec_channel *src1,
285          const union tgsi_exec_channel *src2)
286{
287   dst->f[0] = src0->f[0] * src1->f[0] + src2->f[0];
288   dst->f[1] = src0->f[1] * src1->f[1] + src2->f[1];
289   dst->f[2] = src0->f[2] * src1->f[2] + src2->f[2];
290   dst->f[3] = src0->f[3] * src1->f[3] + src2->f[3];
291}
292
293static void
294micro_mov(union tgsi_exec_channel *dst,
295          const union tgsi_exec_channel *src)
296{
297   dst->u[0] = src->u[0];
298   dst->u[1] = src->u[1];
299   dst->u[2] = src->u[2];
300   dst->u[3] = src->u[3];
301}
302
303static void
304micro_rcp(union tgsi_exec_channel *dst,
305          const union tgsi_exec_channel *src)
306{
307#if 0 /* for debugging */
308   assert(src->f[0] != 0.0f);
309   assert(src->f[1] != 0.0f);
310   assert(src->f[2] != 0.0f);
311   assert(src->f[3] != 0.0f);
312#endif
313   dst->f[0] = 1.0f / src->f[0];
314   dst->f[1] = 1.0f / src->f[1];
315   dst->f[2] = 1.0f / src->f[2];
316   dst->f[3] = 1.0f / src->f[3];
317}
318
319static void
320micro_rnd(union tgsi_exec_channel *dst,
321          const union tgsi_exec_channel *src)
322{
323   dst->f[0] = floorf(src->f[0] + 0.5f);
324   dst->f[1] = floorf(src->f[1] + 0.5f);
325   dst->f[2] = floorf(src->f[2] + 0.5f);
326   dst->f[3] = floorf(src->f[3] + 0.5f);
327}
328
329static void
330micro_rsq(union tgsi_exec_channel *dst,
331          const union tgsi_exec_channel *src)
332{
333#if 0 /* for debugging */
334   assert(src->f[0] != 0.0f);
335   assert(src->f[1] != 0.0f);
336   assert(src->f[2] != 0.0f);
337   assert(src->f[3] != 0.0f);
338#endif
339   dst->f[0] = 1.0f / sqrtf(fabsf(src->f[0]));
340   dst->f[1] = 1.0f / sqrtf(fabsf(src->f[1]));
341   dst->f[2] = 1.0f / sqrtf(fabsf(src->f[2]));
342   dst->f[3] = 1.0f / sqrtf(fabsf(src->f[3]));
343}
344
345static void
346micro_seq(union tgsi_exec_channel *dst,
347          const union tgsi_exec_channel *src0,
348          const union tgsi_exec_channel *src1)
349{
350   dst->f[0] = src0->f[0] == src1->f[0] ? 1.0f : 0.0f;
351   dst->f[1] = src0->f[1] == src1->f[1] ? 1.0f : 0.0f;
352   dst->f[2] = src0->f[2] == src1->f[2] ? 1.0f : 0.0f;
353   dst->f[3] = src0->f[3] == src1->f[3] ? 1.0f : 0.0f;
354}
355
356static void
357micro_sge(union tgsi_exec_channel *dst,
358          const union tgsi_exec_channel *src0,
359          const union tgsi_exec_channel *src1)
360{
361   dst->f[0] = src0->f[0] >= src1->f[0] ? 1.0f : 0.0f;
362   dst->f[1] = src0->f[1] >= src1->f[1] ? 1.0f : 0.0f;
363   dst->f[2] = src0->f[2] >= src1->f[2] ? 1.0f : 0.0f;
364   dst->f[3] = src0->f[3] >= src1->f[3] ? 1.0f : 0.0f;
365}
366
367static void
368micro_sgn(union tgsi_exec_channel *dst,
369          const union tgsi_exec_channel *src)
370{
371   dst->f[0] = src->f[0] < 0.0f ? -1.0f : src->f[0] > 0.0f ? 1.0f : 0.0f;
372   dst->f[1] = src->f[1] < 0.0f ? -1.0f : src->f[1] > 0.0f ? 1.0f : 0.0f;
373   dst->f[2] = src->f[2] < 0.0f ? -1.0f : src->f[2] > 0.0f ? 1.0f : 0.0f;
374   dst->f[3] = src->f[3] < 0.0f ? -1.0f : src->f[3] > 0.0f ? 1.0f : 0.0f;
375}
376
377static void
378micro_sgt(union tgsi_exec_channel *dst,
379          const union tgsi_exec_channel *src0,
380          const union tgsi_exec_channel *src1)
381{
382   dst->f[0] = src0->f[0] > src1->f[0] ? 1.0f : 0.0f;
383   dst->f[1] = src0->f[1] > src1->f[1] ? 1.0f : 0.0f;
384   dst->f[2] = src0->f[2] > src1->f[2] ? 1.0f : 0.0f;
385   dst->f[3] = src0->f[3] > src1->f[3] ? 1.0f : 0.0f;
386}
387
388static void
389micro_sin(union tgsi_exec_channel *dst,
390          const union tgsi_exec_channel *src)
391{
392   dst->f[0] = sinf(src->f[0]);
393   dst->f[1] = sinf(src->f[1]);
394   dst->f[2] = sinf(src->f[2]);
395   dst->f[3] = sinf(src->f[3]);
396}
397
398static void
399micro_sle(union tgsi_exec_channel *dst,
400          const union tgsi_exec_channel *src0,
401          const union tgsi_exec_channel *src1)
402{
403   dst->f[0] = src0->f[0] <= src1->f[0] ? 1.0f : 0.0f;
404   dst->f[1] = src0->f[1] <= src1->f[1] ? 1.0f : 0.0f;
405   dst->f[2] = src0->f[2] <= src1->f[2] ? 1.0f : 0.0f;
406   dst->f[3] = src0->f[3] <= src1->f[3] ? 1.0f : 0.0f;
407}
408
409static void
410micro_slt(union tgsi_exec_channel *dst,
411          const union tgsi_exec_channel *src0,
412          const union tgsi_exec_channel *src1)
413{
414   dst->f[0] = src0->f[0] < src1->f[0] ? 1.0f : 0.0f;
415   dst->f[1] = src0->f[1] < src1->f[1] ? 1.0f : 0.0f;
416   dst->f[2] = src0->f[2] < src1->f[2] ? 1.0f : 0.0f;
417   dst->f[3] = src0->f[3] < src1->f[3] ? 1.0f : 0.0f;
418}
419
420static void
421micro_sne(union tgsi_exec_channel *dst,
422          const union tgsi_exec_channel *src0,
423          const union tgsi_exec_channel *src1)
424{
425   dst->f[0] = src0->f[0] != src1->f[0] ? 1.0f : 0.0f;
426   dst->f[1] = src0->f[1] != src1->f[1] ? 1.0f : 0.0f;
427   dst->f[2] = src0->f[2] != src1->f[2] ? 1.0f : 0.0f;
428   dst->f[3] = src0->f[3] != src1->f[3] ? 1.0f : 0.0f;
429}
430
431static void
432micro_sfl(union tgsi_exec_channel *dst)
433{
434   dst->f[0] = 0.0f;
435   dst->f[1] = 0.0f;
436   dst->f[2] = 0.0f;
437   dst->f[3] = 0.0f;
438}
439
440static void
441micro_str(union tgsi_exec_channel *dst)
442{
443   dst->f[0] = 1.0f;
444   dst->f[1] = 1.0f;
445   dst->f[2] = 1.0f;
446   dst->f[3] = 1.0f;
447}
448
449static void
450micro_trunc(union tgsi_exec_channel *dst,
451            const union tgsi_exec_channel *src)
452{
453   dst->f[0] = (float)(int)src->f[0];
454   dst->f[1] = (float)(int)src->f[1];
455   dst->f[2] = (float)(int)src->f[2];
456   dst->f[3] = (float)(int)src->f[3];
457}
458
459
/* Numeric indices for the X, Y, Z and W channels. */
#define CHAN_X  0
#define CHAN_Y  1
#define CHAN_Z  2
#define CHAN_W  3

/*
 * Data type tags (float, signed int, unsigned int).
 * NOTE(review): the code that consumes these is outside this part of the
 * file - confirm how they are used before relying on this description.
 */
enum tgsi_exec_datatype {
   TGSI_EXEC_DATA_FLOAT,
   TGSI_EXEC_DATA_INT,
   TGSI_EXEC_DATA_UINT
};

/*
 * Shorthand locations of various utility registers (_I = Index, _C = Channel).
 * Each name is a plain alias for the corresponding TGSI_EXEC_TEMP_* macro.
 */
#define TEMP_KILMASK_I     TGSI_EXEC_TEMP_KILMASK_I
#define TEMP_KILMASK_C     TGSI_EXEC_TEMP_KILMASK_C
#define TEMP_OUTPUT_I      TGSI_EXEC_TEMP_OUTPUT_I
#define TEMP_OUTPUT_C      TGSI_EXEC_TEMP_OUTPUT_C
#define TEMP_PRIMITIVE_I   TGSI_EXEC_TEMP_PRIMITIVE_I
#define TEMP_PRIMITIVE_C   TGSI_EXEC_TEMP_PRIMITIVE_C
480
481
/**
 * Recompute the execution mask as the bitwise AND of the machine's
 * CondMask, LoopMask, ContMask, Switch.mask and FuncMask fields.
 *
 * MACH is parenthesized so that any pointer expression can be passed; note
 * that it is evaluated more than once, so it must not have side effects.
 */
#define UPDATE_EXEC_MASK(MACH) \
      ((MACH)->ExecMask = (MACH)->CondMask & (MACH)->LoopMask & \
                          (MACH)->ContMask & (MACH)->Switch.mask & \
                          (MACH)->FuncMask)
485
486
487static const union tgsi_exec_channel ZeroVec =
488   { { 0.0, 0.0, 0.0, 0.0 } };
489
490static const union tgsi_exec_channel OneVec = {
491   {1.0f, 1.0f, 1.0f, 1.0f}
492};
493
494static const union tgsi_exec_channel P128Vec = {
495   {128.0f, 128.0f, 128.0f, 128.0f}
496};
497
498static const union tgsi_exec_channel M128Vec = {
499   {-128.0f, -128.0f, -128.0f, -128.0f}
500};
501
502
503/**
504 * Assert that none of the float values in 'chan' are infinite or NaN.
505 * NaN and Inf may occur normally during program execution and should
506 * not lead to crashes, etc.  But when debugging, it's helpful to catch
507 * them.
508 */
509static INLINE void
510check_inf_or_nan(const union tgsi_exec_channel *chan)
511{
512   assert(!util_is_inf_or_nan((chan)->f[0]));
513   assert(!util_is_inf_or_nan((chan)->f[1]));
514   assert(!util_is_inf_or_nan((chan)->f[2]));
515   assert(!util_is_inf_or_nan((chan)->f[3]));
516}
517
518
#ifdef DEBUG
/** Debug helper: print the four float lanes of 'chan', labelled with 'msg'. */
static void
print_chan(const char *msg, const union tgsi_exec_channel *chan)
{
   const float *lanes = chan->f;

   debug_printf("%s = {%f, %f, %f, %f}\n",
                msg, lanes[0], lanes[1], lanes[2], lanes[3]);
}
#endif
527
528
#ifdef DEBUG
/**
 * Debug helper: print temporary register 'index' of the machine, one line
 * per X/Y/Z/W channel with its four float lanes.
 */
static void
print_temp(const struct tgsi_exec_machine *mach, uint index)
{
   static const char channel_names[] = "XYZW";
   const struct tgsi_exec_vector *reg = &mach->Temps[index];
   int chan = 0;

   debug_printf("Temp[%u] =\n", index);
   while (chan < 4) {
      const float *lanes = reg->xyzw[chan].f;

      debug_printf("  %c: { %f, %f, %f, %f }\n",
                   channel_names[chan],
                   lanes[0], lanes[1], lanes[2], lanes[3]);
      chan++;
   }
}
#endif
546
547
548void
549tgsi_exec_set_constant_buffers(struct tgsi_exec_machine *mach,
550                               unsigned num_bufs,
551                               const void **bufs,
552                               const unsigned *buf_sizes)
553{
554   unsigned i;
555
556   for (i = 0; i < num_bufs; i++) {
557      mach->Consts[i] = bufs[i];
558      mach->ConstsSize[i] = buf_sizes[i];
559   }
560}
561
562
563/**
564 * Check if there's a potential src/dst register data dependency when
565 * using SOA execution.
566 * Example:
567 *   MOV T, T.yxwz;
568 * This would expand into:
569 *   MOV t0, t1;
570 *   MOV t1, t0;
571 *   MOV t2, t3;
572 *   MOV t3, t2;
573 * The second instruction will have the wrong value for t0 if executed as-is.
574 */
575boolean
576tgsi_check_soa_dependencies(const struct tgsi_full_instruction *inst)
577{
578   uint i, chan;
579
580   uint writemask = inst->Dst[0].Register.WriteMask;
581   if (writemask == TGSI_WRITEMASK_X ||
582       writemask == TGSI_WRITEMASK_Y ||
583       writemask == TGSI_WRITEMASK_Z ||
584       writemask == TGSI_WRITEMASK_W ||
585       writemask == TGSI_WRITEMASK_NONE) {
586      /* no chance of data dependency */
587      return FALSE;
588   }
589
590   /* loop over src regs */
591   for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
592      if ((inst->Src[i].Register.File ==
593           inst->Dst[0].Register.File) &&
594          ((inst->Src[i].Register.Index ==
595            inst->Dst[0].Register.Index) ||
596           inst->Src[i].Register.Indirect ||
597           inst->Dst[0].Register.Indirect)) {
598         /* loop over dest channels */
599         uint channelsWritten = 0x0;
600         for (chan = 0; chan < NUM_CHANNELS; chan++) {
601            if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
602               /* check if we're reading a channel that's been written */
603               uint swizzle = tgsi_util_get_full_src_register_swizzle(&inst->Src[i], chan);
604               if (channelsWritten & (1 << swizzle)) {
605                  return TRUE;
606               }
607
608               channelsWritten |= (1 << chan);
609            }
610         }
611      }
612   }
613   return FALSE;
614}
615
616
617/**
618 * Initialize machine state by expanding tokens to full instructions,
619 * allocating temporary storage, setting up constants, etc.
620 * After this, we can call tgsi_exec_machine_run() many times.
621 */
622void
623tgsi_exec_machine_bind_shader(
624   struct tgsi_exec_machine *mach,
625   const struct tgsi_token *tokens,
626   uint numSamplers,
627   struct tgsi_sampler **samplers)
628{
629   uint k;
630   struct tgsi_parse_context parse;
631   struct tgsi_full_instruction *instructions;
632   struct tgsi_full_declaration *declarations;
633   uint maxInstructions = 10, numInstructions = 0;
634   uint maxDeclarations = 10, numDeclarations = 0;
635
636#if 0
637   tgsi_dump(tokens, 0);
638#endif
639
640   util_init_math();
641
642   if (numSamplers) {
643      assert(samplers);
644   }
645
646   mach->Tokens = tokens;
647   mach->Samplers = samplers;
648
649   if (!tokens) {
650      /* unbind and free all */
651      if (mach->Declarations) {
652         FREE( mach->Declarations );
653      }
654      mach->Declarations = NULL;
655      mach->NumDeclarations = 0;
656
657      if (mach->Instructions) {
658         FREE( mach->Instructions );
659      }
660      mach->Instructions = NULL;
661      mach->NumInstructions = 0;
662
663      return;
664   }
665
666   k = tgsi_parse_init (&parse, mach->Tokens);
667   if (k != TGSI_PARSE_OK) {
668      debug_printf( "Problem parsing!\n" );
669      return;
670   }
671
672   mach->Processor = parse.FullHeader.Processor.Processor;
673   mach->ImmLimit = 0;
674
675   if (mach->Processor == TGSI_PROCESSOR_GEOMETRY &&
676       !mach->UsedGeometryShader) {
677      struct tgsi_exec_vector *inputs;
678      struct tgsi_exec_vector *outputs;
679
680      inputs = align_malloc(sizeof(struct tgsi_exec_vector) *
681                            TGSI_MAX_PRIM_VERTICES * PIPE_MAX_ATTRIBS,
682                            16);
683
684      if (!inputs)
685         return;
686
687      outputs = align_malloc(sizeof(struct tgsi_exec_vector) *
688                             TGSI_MAX_TOTAL_VERTICES, 16);
689
690      if (!outputs) {
691         align_free(inputs);
692         return;
693      }
694
695      align_free(mach->Inputs);
696      align_free(mach->Outputs);
697
698      mach->Inputs = inputs;
699      mach->Outputs = outputs;
700      mach->UsedGeometryShader = TRUE;
701   }
702
703   declarations = (struct tgsi_full_declaration *)
704      MALLOC( maxDeclarations * sizeof(struct tgsi_full_declaration) );
705
706   if (!declarations) {
707      return;
708   }
709
710   instructions = (struct tgsi_full_instruction *)
711      MALLOC( maxInstructions * sizeof(struct tgsi_full_instruction) );
712
713   if (!instructions) {
714      FREE( declarations );
715      return;
716   }
717
718   while( !tgsi_parse_end_of_tokens( &parse ) ) {
719      uint i;
720
721      tgsi_parse_token( &parse );
722      switch( parse.FullToken.Token.Type ) {
723      case TGSI_TOKEN_TYPE_DECLARATION:
724         /* save expanded declaration */
725         if (numDeclarations == maxDeclarations) {
726            declarations = REALLOC(declarations,
727                                   maxDeclarations
728                                   * sizeof(struct tgsi_full_declaration),
729                                   (maxDeclarations + 10)
730                                   * sizeof(struct tgsi_full_declaration));
731            maxDeclarations += 10;
732         }
733         if (parse.FullToken.FullDeclaration.Declaration.File == TGSI_FILE_OUTPUT) {
734            unsigned reg;
735            for (reg = parse.FullToken.FullDeclaration.Range.First;
736                 reg <= parse.FullToken.FullDeclaration.Range.Last;
737                 ++reg) {
738               ++mach->NumOutputs;
739            }
740         }
741         if (parse.FullToken.FullDeclaration.Declaration.File ==
742             TGSI_FILE_IMMEDIATE_ARRAY) {
743            unsigned reg;
744            struct tgsi_full_declaration *decl =
745               &parse.FullToken.FullDeclaration;
746            debug_assert(decl->Range.Last < TGSI_EXEC_NUM_IMMEDIATES);
747            for (reg = decl->Range.First; reg <= decl->Range.Last; ++reg) {
748               for( i = 0; i < 4; i++ ) {
749                  int idx = reg * 4 + i;
750                  mach->ImmArray[reg][i] = decl->ImmediateData.u[idx].Float;
751               }
752            }
753         }
754         memcpy(declarations + numDeclarations,
755                &parse.FullToken.FullDeclaration,
756                sizeof(declarations[0]));
757         numDeclarations++;
758         break;
759
760      case TGSI_TOKEN_TYPE_IMMEDIATE:
761         {
762            uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
763            assert( size <= 4 );
764            assert( mach->ImmLimit + 1 <= TGSI_EXEC_NUM_IMMEDIATES );
765
766            for( i = 0; i < size; i++ ) {
767               mach->Imms[mach->ImmLimit][i] =
768		  parse.FullToken.FullImmediate.u[i].Float;
769            }
770            mach->ImmLimit += 1;
771         }
772         break;
773
774      case TGSI_TOKEN_TYPE_INSTRUCTION:
775
776         /* save expanded instruction */
777         if (numInstructions == maxInstructions) {
778            instructions = REALLOC(instructions,
779                                   maxInstructions
780                                   * sizeof(struct tgsi_full_instruction),
781                                   (maxInstructions + 10)
782                                   * sizeof(struct tgsi_full_instruction));
783            maxInstructions += 10;
784         }
785
786         memcpy(instructions + numInstructions,
787                &parse.FullToken.FullInstruction,
788                sizeof(instructions[0]));
789
790         numInstructions++;
791         break;
792
793      case TGSI_TOKEN_TYPE_PROPERTY:
794         break;
795
796      default:
797         assert( 0 );
798      }
799   }
800   tgsi_parse_free (&parse);
801
802   if (mach->Declarations) {
803      FREE( mach->Declarations );
804   }
805   mach->Declarations = declarations;
806   mach->NumDeclarations = numDeclarations;
807
808   if (mach->Instructions) {
809      FREE( mach->Instructions );
810   }
811   mach->Instructions = instructions;
812   mach->NumInstructions = numInstructions;
813}
814
815
816struct tgsi_exec_machine *
817tgsi_exec_machine_create( void )
818{
819   struct tgsi_exec_machine *mach;
820   uint i;
821
822   mach = align_malloc( sizeof *mach, 16 );
823   if (!mach)
824      goto fail;
825
826   memset(mach, 0, sizeof(*mach));
827
828   mach->Addrs = &mach->Temps[TGSI_EXEC_TEMP_ADDR];
829   mach->MaxGeometryShaderOutputs = TGSI_MAX_TOTAL_VERTICES;
830   mach->Predicates = &mach->Temps[TGSI_EXEC_TEMP_P0];
831
832   mach->Inputs = align_malloc(sizeof(struct tgsi_exec_vector) * PIPE_MAX_ATTRIBS, 16);
833   mach->Outputs = align_malloc(sizeof(struct tgsi_exec_vector) * PIPE_MAX_ATTRIBS, 16);
834   if (!mach->Inputs || !mach->Outputs)
835      goto fail;
836
837   /* Setup constants needed by the SSE2 executor. */
838   for( i = 0; i < 4; i++ ) {
839      mach->Temps[TGSI_EXEC_TEMP_00000000_I].xyzw[TGSI_EXEC_TEMP_00000000_C].u[i] = 0x00000000;
840      mach->Temps[TGSI_EXEC_TEMP_7FFFFFFF_I].xyzw[TGSI_EXEC_TEMP_7FFFFFFF_C].u[i] = 0x7FFFFFFF;
841      mach->Temps[TGSI_EXEC_TEMP_80000000_I].xyzw[TGSI_EXEC_TEMP_80000000_C].u[i] = 0x80000000;
842      mach->Temps[TGSI_EXEC_TEMP_FFFFFFFF_I].xyzw[TGSI_EXEC_TEMP_FFFFFFFF_C].u[i] = 0xFFFFFFFF;    /* not used */
843      mach->Temps[TGSI_EXEC_TEMP_ONE_I].xyzw[TGSI_EXEC_TEMP_ONE_C].f[i] = 1.0f;
844      mach->Temps[TGSI_EXEC_TEMP_TWO_I].xyzw[TGSI_EXEC_TEMP_TWO_C].f[i] = 2.0f;    /* not used */
845      mach->Temps[TGSI_EXEC_TEMP_128_I].xyzw[TGSI_EXEC_TEMP_128_C].f[i] = 128.0f;
846      mach->Temps[TGSI_EXEC_TEMP_MINUS_128_I].xyzw[TGSI_EXEC_TEMP_MINUS_128_C].f[i] = -128.0f;
847      mach->Temps[TGSI_EXEC_TEMP_THREE_I].xyzw[TGSI_EXEC_TEMP_THREE_C].f[i] = 3.0f;
848      mach->Temps[TGSI_EXEC_TEMP_HALF_I].xyzw[TGSI_EXEC_TEMP_HALF_C].f[i] = 0.5f;
849   }
850
851#ifdef DEBUG
852   /* silence warnings */
853   (void) print_chan;
854   (void) print_temp;
855#endif
856
857   return mach;
858
859fail:
860   if (mach) {
861      align_free(mach->Inputs);
862      align_free(mach->Outputs);
863      align_free(mach);
864   }
865   return NULL;
866}
867
868
869void
870tgsi_exec_machine_destroy(struct tgsi_exec_machine *mach)
871{
872   if (mach) {
873      if (mach->Instructions)
874         FREE(mach->Instructions);
875      if (mach->Declarations)
876         FREE(mach->Declarations);
877
878      align_free(mach->Inputs);
879      align_free(mach->Outputs);
880
881      align_free(mach);
882   }
883}
884
885static void
886micro_add(union tgsi_exec_channel *dst,
887          const union tgsi_exec_channel *src0,
888          const union tgsi_exec_channel *src1)
889{
890   dst->f[0] = src0->f[0] + src1->f[0];
891   dst->f[1] = src0->f[1] + src1->f[1];
892   dst->f[2] = src0->f[2] + src1->f[2];
893   dst->f[3] = src0->f[3] + src1->f[3];
894}
895
896static void
897micro_div(
898   union tgsi_exec_channel *dst,
899   const union tgsi_exec_channel *src0,
900   const union tgsi_exec_channel *src1 )
901{
902   if (src1->f[0] != 0) {
903      dst->f[0] = src0->f[0] / src1->f[0];
904   }
905   if (src1->f[1] != 0) {
906      dst->f[1] = src0->f[1] / src1->f[1];
907   }
908   if (src1->f[2] != 0) {
909      dst->f[2] = src0->f[2] / src1->f[2];
910   }
911   if (src1->f[3] != 0) {
912      dst->f[3] = src0->f[3] / src1->f[3];
913   }
914}
915
916static void
917micro_rcc(union tgsi_exec_channel *dst,
918          const union tgsi_exec_channel *src)
919{
920   uint i;
921
922   for (i = 0; i < 4; i++) {
923      float recip = 1.0f / src->f[i];
924
925      if (recip > 0.0f) {
926         if (recip > 1.884467e+019f) {
927            dst->f[i] = 1.884467e+019f;
928         }
929         else if (recip < 5.42101e-020f) {
930            dst->f[i] = 5.42101e-020f;
931         }
932         else {
933            dst->f[i] = recip;
934         }
935      }
936      else {
937         if (recip < -1.884467e+019f) {
938            dst->f[i] = -1.884467e+019f;
939         }
940         else if (recip > -5.42101e-020f) {
941            dst->f[i] = -5.42101e-020f;
942         }
943         else {
944            dst->f[i] = recip;
945         }
946      }
947   }
948}
949
950static void
951micro_lt(
952   union tgsi_exec_channel *dst,
953   const union tgsi_exec_channel *src0,
954   const union tgsi_exec_channel *src1,
955   const union tgsi_exec_channel *src2,
956   const union tgsi_exec_channel *src3 )
957{
958   dst->f[0] = src0->f[0] < src1->f[0] ? src2->f[0] : src3->f[0];
959   dst->f[1] = src0->f[1] < src1->f[1] ? src2->f[1] : src3->f[1];
960   dst->f[2] = src0->f[2] < src1->f[2] ? src2->f[2] : src3->f[2];
961   dst->f[3] = src0->f[3] < src1->f[3] ? src2->f[3] : src3->f[3];
962}
963
964static void
965micro_max(union tgsi_exec_channel *dst,
966          const union tgsi_exec_channel *src0,
967          const union tgsi_exec_channel *src1)
968{
969   dst->f[0] = src0->f[0] > src1->f[0] ? src0->f[0] : src1->f[0];
970   dst->f[1] = src0->f[1] > src1->f[1] ? src0->f[1] : src1->f[1];
971   dst->f[2] = src0->f[2] > src1->f[2] ? src0->f[2] : src1->f[2];
972   dst->f[3] = src0->f[3] > src1->f[3] ? src0->f[3] : src1->f[3];
973}
974
975static void
976micro_min(union tgsi_exec_channel *dst,
977          const union tgsi_exec_channel *src0,
978          const union tgsi_exec_channel *src1)
979{
980   dst->f[0] = src0->f[0] < src1->f[0] ? src0->f[0] : src1->f[0];
981   dst->f[1] = src0->f[1] < src1->f[1] ? src0->f[1] : src1->f[1];
982   dst->f[2] = src0->f[2] < src1->f[2] ? src0->f[2] : src1->f[2];
983   dst->f[3] = src0->f[3] < src1->f[3] ? src0->f[3] : src1->f[3];
984}
985
986static void
987micro_mul(union tgsi_exec_channel *dst,
988          const union tgsi_exec_channel *src0,
989          const union tgsi_exec_channel *src1)
990{
991   dst->f[0] = src0->f[0] * src1->f[0];
992   dst->f[1] = src0->f[1] * src1->f[1];
993   dst->f[2] = src0->f[2] * src1->f[2];
994   dst->f[3] = src0->f[3] * src1->f[3];
995}
996
997static void
998micro_neg(
999   union tgsi_exec_channel *dst,
1000   const union tgsi_exec_channel *src )
1001{
1002   dst->f[0] = -src->f[0];
1003   dst->f[1] = -src->f[1];
1004   dst->f[2] = -src->f[2];
1005   dst->f[3] = -src->f[3];
1006}
1007
1008static void
1009micro_pow(
1010   union tgsi_exec_channel *dst,
1011   const union tgsi_exec_channel *src0,
1012   const union tgsi_exec_channel *src1 )
1013{
1014#if FAST_MATH
1015   dst->f[0] = util_fast_pow( src0->f[0], src1->f[0] );
1016   dst->f[1] = util_fast_pow( src0->f[1], src1->f[1] );
1017   dst->f[2] = util_fast_pow( src0->f[2], src1->f[2] );
1018   dst->f[3] = util_fast_pow( src0->f[3], src1->f[3] );
1019#else
1020   dst->f[0] = powf( src0->f[0], src1->f[0] );
1021   dst->f[1] = powf( src0->f[1], src1->f[1] );
1022   dst->f[2] = powf( src0->f[2], src1->f[2] );
1023   dst->f[3] = powf( src0->f[3], src1->f[3] );
1024#endif
1025}
1026
1027static void
1028micro_sub(union tgsi_exec_channel *dst,
1029          const union tgsi_exec_channel *src0,
1030          const union tgsi_exec_channel *src1)
1031{
1032   dst->f[0] = src0->f[0] - src1->f[0];
1033   dst->f[1] = src0->f[1] - src1->f[1];
1034   dst->f[2] = src0->f[2] - src1->f[2];
1035   dst->f[3] = src0->f[3] - src1->f[3];
1036}
1037
1038static void
1039fetch_src_file_channel(const struct tgsi_exec_machine *mach,
1040                       const uint chan_index,
1041                       const uint file,
1042                       const uint swizzle,
1043                       const union tgsi_exec_channel *index,
1044                       const union tgsi_exec_channel *index2D,
1045                       union tgsi_exec_channel *chan)
1046{
1047   uint i;
1048
1049   assert(swizzle < 4);
1050
1051   switch (file) {
1052   case TGSI_FILE_CONSTANT:
1053      for (i = 0; i < QUAD_SIZE; i++) {
1054         assert(index2D->i[i] >= 0 && index2D->i[i] < PIPE_MAX_CONSTANT_BUFFERS);
1055         assert(mach->Consts[index2D->i[i]]);
1056
1057         if (index->i[i] < 0) {
1058            chan->u[i] = 0;
1059         } else {
1060            /* NOTE: copying the const value as a uint instead of float */
1061            const uint constbuf = index2D->i[i];
1062            const uint *buf = (const uint *)mach->Consts[constbuf];
1063            const int pos = index->i[i] * 4 + swizzle;
1064            /* const buffer bounds check */
1065            if (pos < 0 || pos >= mach->ConstsSize[constbuf]) {
1066               if (0) {
1067                  /* Debug: print warning */
1068                  static int count = 0;
1069                  if (count++ < 100)
1070                     debug_printf("TGSI Exec: const buffer index %d"
1071                                  " out of bounds\n", pos);
1072               }
1073               chan->u[i] = 0;
1074            }
1075            else
1076               chan->u[i] = buf[pos];
1077         }
1078      }
1079      break;
1080
1081   case TGSI_FILE_INPUT:
1082      for (i = 0; i < QUAD_SIZE; i++) {
1083         /*
1084         if (TGSI_PROCESSOR_GEOMETRY == mach->Processor) {
1085            debug_printf("Fetching Input[%d] (2d=%d, 1d=%d)\n",
1086                         index2D->i[i] * TGSI_EXEC_MAX_INPUT_ATTRIBS + index->i[i],
1087                         index2D->i[i], index->i[i]);
1088                         }*/
1089         int pos = index2D->i[i] * TGSI_EXEC_MAX_INPUT_ATTRIBS + index->i[i];
1090         assert(pos >= 0);
1091         assert(pos < TGSI_MAX_PRIM_VERTICES * PIPE_MAX_ATTRIBS);
1092         chan->u[i] = mach->Inputs[pos].xyzw[swizzle].u[i];
1093      }
1094      break;
1095
1096   case TGSI_FILE_SYSTEM_VALUE:
1097      /* XXX no swizzling at this point.  Will be needed if we put
1098       * gl_FragCoord, for example, in a sys value register.
1099       */
1100      for (i = 0; i < QUAD_SIZE; i++) {
1101         chan->u[i] = mach->SystemValue[index->i[i]].u[i];
1102      }
1103      break;
1104
1105   case TGSI_FILE_TEMPORARY:
1106      for (i = 0; i < QUAD_SIZE; i++) {
1107         assert(index->i[i] < TGSI_EXEC_NUM_TEMPS);
1108         assert(index2D->i[i] == 0);
1109
1110         chan->u[i] = mach->Temps[index->i[i]].xyzw[swizzle].u[i];
1111      }
1112      break;
1113
1114   case TGSI_FILE_TEMPORARY_ARRAY:
1115      for (i = 0; i < QUAD_SIZE; i++) {
1116         assert(index->i[i] < TGSI_EXEC_NUM_TEMPS);
1117         assert(index2D->i[i] < TGSI_EXEC_NUM_TEMP_ARRAYS);
1118
1119         chan->u[i] =
1120            mach->TempArray[index2D->i[i]][index->i[i]].xyzw[swizzle].u[i];
1121      }
1122      break;
1123
1124   case TGSI_FILE_IMMEDIATE:
1125      for (i = 0; i < QUAD_SIZE; i++) {
1126         assert(index->i[i] >= 0 && index->i[i] < (int)mach->ImmLimit);
1127         assert(index2D->i[i] == 0);
1128
1129         chan->f[i] = mach->Imms[index->i[i]][swizzle];
1130      }
1131      break;
1132
1133   case TGSI_FILE_IMMEDIATE_ARRAY:
1134      for (i = 0; i < QUAD_SIZE; i++) {
1135         assert(index2D->i[i] == 0);
1136
1137         chan->f[i] = mach->ImmArray[index->i[i]][swizzle];
1138      }
1139      break;
1140
1141   case TGSI_FILE_ADDRESS:
1142      for (i = 0; i < QUAD_SIZE; i++) {
1143         assert(index->i[i] >= 0);
1144         assert(index2D->i[i] == 0);
1145
1146         chan->u[i] = mach->Addrs[index->i[i]].xyzw[swizzle].u[i];
1147      }
1148      break;
1149
1150   case TGSI_FILE_PREDICATE:
1151      for (i = 0; i < QUAD_SIZE; i++) {
1152         assert(index->i[i] >= 0 && index->i[i] < TGSI_EXEC_NUM_PREDS);
1153         assert(index2D->i[i] == 0);
1154
1155         chan->u[i] = mach->Predicates[0].xyzw[swizzle].u[i];
1156      }
1157      break;
1158
1159   case TGSI_FILE_OUTPUT:
1160      /* vertex/fragment output vars can be read too */
1161      for (i = 0; i < QUAD_SIZE; i++) {
1162         assert(index->i[i] >= 0);
1163         assert(index2D->i[i] == 0);
1164
1165         chan->u[i] = mach->Outputs[index->i[i]].xyzw[swizzle].u[i];
1166      }
1167      break;
1168
1169   default:
1170      assert(0);
1171      for (i = 0; i < QUAD_SIZE; i++) {
1172         chan->u[i] = 0;
1173      }
1174   }
1175}
1176
1177static void
1178fetch_source(const struct tgsi_exec_machine *mach,
1179             union tgsi_exec_channel *chan,
1180             const struct tgsi_full_src_register *reg,
1181             const uint chan_index,
1182             enum tgsi_exec_datatype src_datatype)
1183{
1184   union tgsi_exec_channel index;
1185   union tgsi_exec_channel index2D;
1186   uint swizzle;
1187
1188   /* We start with a direct index into a register file.
1189    *
1190    *    file[1],
1191    *    where:
1192    *       file = Register.File
1193    *       [1] = Register.Index
1194    */
1195   index.i[0] =
1196   index.i[1] =
1197   index.i[2] =
1198   index.i[3] = reg->Register.Index;
1199
1200   /* There is an extra source register that indirectly subscripts
1201    * a register file. The direct index now becomes an offset
1202    * that is being added to the indirect register.
1203    *
1204    *    file[ind[2].x+1],
1205    *    where:
1206    *       ind = Indirect.File
1207    *       [2] = Indirect.Index
1208    *       .x = Indirect.SwizzleX
1209    */
1210   if (reg->Register.Indirect) {
1211      union tgsi_exec_channel index2;
1212      union tgsi_exec_channel indir_index;
1213      const uint execmask = mach->ExecMask;
1214      uint i;
1215
1216      /* which address register (always zero now) */
1217      index2.i[0] =
1218      index2.i[1] =
1219      index2.i[2] =
1220      index2.i[3] = reg->Indirect.Index;
1221      assert(reg->Indirect.File == TGSI_FILE_ADDRESS);
1222      /* get current value of address register[swizzle] */
1223      swizzle = tgsi_util_get_src_register_swizzle( &reg->Indirect, CHAN_X );
1224      fetch_src_file_channel(mach,
1225                             chan_index,
1226                             reg->Indirect.File,
1227                             swizzle,
1228                             &index2,
1229                             &ZeroVec,
1230                             &indir_index);
1231
1232      /* add value of address register to the offset */
1233      index.i[0] += indir_index.i[0];
1234      index.i[1] += indir_index.i[1];
1235      index.i[2] += indir_index.i[2];
1236      index.i[3] += indir_index.i[3];
1237
1238      /* for disabled execution channels, zero-out the index to
1239       * avoid using a potential garbage value.
1240       */
1241      for (i = 0; i < QUAD_SIZE; i++) {
1242         if ((execmask & (1 << i)) == 0)
1243            index.i[i] = 0;
1244      }
1245   }
1246
1247   /* There is an extra source register that is a second
1248    * subscript to a register file. Effectively it means that
1249    * the register file is actually a 2D array of registers.
1250    *
1251    *    file[3][1],
1252    *    where:
1253    *       [3] = Dimension.Index
1254    */
1255   if (reg->Register.Dimension) {
1256      index2D.i[0] =
1257      index2D.i[1] =
1258      index2D.i[2] =
1259      index2D.i[3] = reg->Dimension.Index;
1260
1261      /* Again, the second subscript index can be addressed indirectly
1262       * identically to the first one.
1263       * Nothing stops us from indirectly addressing the indirect register,
1264       * but there is no need for that, so we won't exercise it.
1265       *
1266       *    file[ind[4].y+3][1],
1267       *    where:
1268       *       ind = DimIndirect.File
1269       *       [4] = DimIndirect.Index
1270       *       .y = DimIndirect.SwizzleX
1271       */
1272      if (reg->Dimension.Indirect) {
1273         union tgsi_exec_channel index2;
1274         union tgsi_exec_channel indir_index;
1275         const uint execmask = mach->ExecMask;
1276         uint i;
1277
1278         index2.i[0] =
1279         index2.i[1] =
1280         index2.i[2] =
1281         index2.i[3] = reg->DimIndirect.Index;
1282
1283         swizzle = tgsi_util_get_src_register_swizzle( &reg->DimIndirect, CHAN_X );
1284         fetch_src_file_channel(mach,
1285                                chan_index,
1286                                reg->DimIndirect.File,
1287                                swizzle,
1288                                &index2,
1289                                &ZeroVec,
1290                                &indir_index);
1291
1292         index2D.i[0] += indir_index.i[0];
1293         index2D.i[1] += indir_index.i[1];
1294         index2D.i[2] += indir_index.i[2];
1295         index2D.i[3] += indir_index.i[3];
1296
1297         /* for disabled execution channels, zero-out the index to
1298          * avoid using a potential garbage value.
1299          */
1300         for (i = 0; i < QUAD_SIZE; i++) {
1301            if ((execmask & (1 << i)) == 0) {
1302               index2D.i[i] = 0;
1303            }
1304         }
1305      }
1306
1307      /* If by any chance there was a need for a 3D array of register
1308       * files, we would have to check whether Dimension is followed
1309       * by a dimension register and continue the saga.
1310       */
1311   } else {
1312      index2D.i[0] =
1313      index2D.i[1] =
1314      index2D.i[2] =
1315      index2D.i[3] = 0;
1316   }
1317
1318   swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index );
1319   fetch_src_file_channel(mach,
1320                          chan_index,
1321                          reg->Register.File,
1322                          swizzle,
1323                          &index,
1324                          &index2D,
1325                          chan);
1326
1327   if (reg->Register.Absolute) {
1328      if (src_datatype == TGSI_EXEC_DATA_FLOAT) {
1329         micro_abs(chan, chan);
1330      } else {
1331         micro_iabs(chan, chan);
1332      }
1333   }
1334
1335   if (reg->Register.Negate) {
1336      if (src_datatype == TGSI_EXEC_DATA_FLOAT) {
1337         micro_neg(chan, chan);
1338      } else {
1339         micro_ineg(chan, chan);
1340      }
1341   }
1342}
1343
1344static void
1345store_dest(struct tgsi_exec_machine *mach,
1346           const union tgsi_exec_channel *chan,
1347           const struct tgsi_full_dst_register *reg,
1348           const struct tgsi_full_instruction *inst,
1349           uint chan_index,
1350           enum tgsi_exec_datatype dst_datatype)
1351{
1352   uint i;
1353   union tgsi_exec_channel null;
1354   union tgsi_exec_channel *dst;
1355   union tgsi_exec_channel index2D;
1356   uint execmask = mach->ExecMask;
1357   int offset = 0;  /* indirection offset */
1358   int index;
1359
1360   /* for debugging */
1361   if (0 && dst_datatype == TGSI_EXEC_DATA_FLOAT) {
1362      check_inf_or_nan(chan);
1363   }
1364
1365   /* There is an extra source register that indirectly subscripts
1366    * a register file. The direct index now becomes an offset
1367    * that is being added to the indirect register.
1368    *
1369    *    file[ind[2].x+1],
1370    *    where:
1371    *       ind = Indirect.File
1372    *       [2] = Indirect.Index
1373    *       .x = Indirect.SwizzleX
1374    */
1375   if (reg->Register.Indirect) {
1376      union tgsi_exec_channel index;
1377      union tgsi_exec_channel indir_index;
1378      uint swizzle;
1379
1380      /* which address register (always zero for now) */
1381      index.i[0] =
1382      index.i[1] =
1383      index.i[2] =
1384      index.i[3] = reg->Indirect.Index;
1385
1386      /* get current value of address register[swizzle] */
1387      swizzle = tgsi_util_get_src_register_swizzle( &reg->Indirect, CHAN_X );
1388
1389      /* fetch values from the address/indirection register */
1390      fetch_src_file_channel(mach,
1391                             chan_index,
1392                             reg->Indirect.File,
1393                             swizzle,
1394                             &index,
1395                             &ZeroVec,
1396                             &indir_index);
1397
1398      /* save indirection offset */
1399      offset = indir_index.i[0];
1400   }
1401
1402   /* There is an extra source register that is a second
1403    * subscript to a register file. Effectively it means that
1404    * the register file is actually a 2D array of registers.
1405    *
1406    *    file[3][1],
1407    *    where:
1408    *       [3] = Dimension.Index
1409    */
1410   if (reg->Register.Dimension) {
1411      index2D.i[0] =
1412      index2D.i[1] =
1413      index2D.i[2] =
1414      index2D.i[3] = reg->Dimension.Index;
1415
1416      /* Again, the second subscript index can be addressed indirectly
1417       * identically to the first one.
1418       * Nothing stops us from indirectly addressing the indirect register,
1419       * but there is no need for that, so we won't exercise it.
1420       *
1421       *    file[ind[4].y+3][1],
1422       *    where:
1423       *       ind = DimIndirect.File
1424       *       [4] = DimIndirect.Index
1425       *       .y = DimIndirect.SwizzleX
1426       */
1427      if (reg->Dimension.Indirect) {
1428         union tgsi_exec_channel index2;
1429         union tgsi_exec_channel indir_index;
1430         const uint execmask = mach->ExecMask;
1431         unsigned swizzle;
1432         uint i;
1433
1434         index2.i[0] =
1435         index2.i[1] =
1436         index2.i[2] =
1437         index2.i[3] = reg->DimIndirect.Index;
1438
1439         swizzle = tgsi_util_get_src_register_swizzle( &reg->DimIndirect, CHAN_X );
1440         fetch_src_file_channel(mach,
1441                                chan_index,
1442                                reg->DimIndirect.File,
1443                                swizzle,
1444                                &index2,
1445                                &ZeroVec,
1446                                &indir_index);
1447
1448         index2D.i[0] += indir_index.i[0];
1449         index2D.i[1] += indir_index.i[1];
1450         index2D.i[2] += indir_index.i[2];
1451         index2D.i[3] += indir_index.i[3];
1452
1453         /* for disabled execution channels, zero-out the index to
1454          * avoid using a potential garbage value.
1455          */
1456         for (i = 0; i < QUAD_SIZE; i++) {
1457            if ((execmask & (1 << i)) == 0) {
1458               index2D.i[i] = 0;
1459            }
1460         }
1461      }
1462
1463      /* If by any chance there was a need for a 3D array of register
1464       * files, we would have to check whether Dimension is followed
1465       * by a dimension register and continue the saga.
1466       */
1467   } else {
1468      index2D.i[0] =
1469      index2D.i[1] =
1470      index2D.i[2] =
1471      index2D.i[3] = 0;
1472   }
1473
1474   switch (reg->Register.File) {
1475   case TGSI_FILE_NULL:
1476      dst = &null;
1477      break;
1478
1479   case TGSI_FILE_OUTPUT:
1480      index = mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0]
1481         + reg->Register.Index;
1482      dst = &mach->Outputs[offset + index].xyzw[chan_index];
1483#if 0
1484      if (TGSI_PROCESSOR_GEOMETRY == mach->Processor) {
1485         fprintf(stderr, "STORING OUT[%d] mask(%d), = (", offset + index, execmask);
1486         for (i = 0; i < QUAD_SIZE; i++)
1487            if (execmask & (1 << i))
1488               fprintf(stderr, "%f, ", chan->f[i]);
1489         fprintf(stderr, ")\n");
1490      }
1491#endif
1492      break;
1493
1494   case TGSI_FILE_TEMPORARY:
1495      index = reg->Register.Index;
1496      assert( index < TGSI_EXEC_NUM_TEMPS );
1497      dst = &mach->Temps[offset + index].xyzw[chan_index];
1498      break;
1499
1500   case TGSI_FILE_TEMPORARY_ARRAY:
1501      index = reg->Register.Index;
1502      assert( index < TGSI_EXEC_NUM_TEMPS );
1503      assert( index2D.i[0] < TGSI_EXEC_NUM_TEMP_ARRAYS );
1504      /* XXX we use index2D.i[0] here but somehow we might
1505       * end up with someone trying to store indirectly in
1506       * different buffers */
1507      dst = &mach->TempArray[index2D.i[0]][offset + index].xyzw[chan_index];
1508      break;
1509
1510   case TGSI_FILE_ADDRESS:
1511      index = reg->Register.Index;
1512      dst = &mach->Addrs[index].xyzw[chan_index];
1513      break;
1514
1515   case TGSI_FILE_PREDICATE:
1516      index = reg->Register.Index;
1517      assert(index < TGSI_EXEC_NUM_PREDS);
1518      dst = &mach->Predicates[index].xyzw[chan_index];
1519      break;
1520
1521   default:
1522      assert( 0 );
1523      return;
1524   }
1525
1526   if (inst->Instruction.Predicate) {
1527      uint swizzle;
1528      union tgsi_exec_channel *pred;
1529
1530      switch (chan_index) {
1531      case CHAN_X:
1532         swizzle = inst->Predicate.SwizzleX;
1533         break;
1534      case CHAN_Y:
1535         swizzle = inst->Predicate.SwizzleY;
1536         break;
1537      case CHAN_Z:
1538         swizzle = inst->Predicate.SwizzleZ;
1539         break;
1540      case CHAN_W:
1541         swizzle = inst->Predicate.SwizzleW;
1542         break;
1543      default:
1544         assert(0);
1545         return;
1546      }
1547
1548      assert(inst->Predicate.Index == 0);
1549
1550      pred = &mach->Predicates[inst->Predicate.Index].xyzw[swizzle];
1551
1552      if (inst->Predicate.Negate) {
1553         for (i = 0; i < QUAD_SIZE; i++) {
1554            if (pred->u[i]) {
1555               execmask &= ~(1 << i);
1556            }
1557         }
1558      } else {
1559         for (i = 0; i < QUAD_SIZE; i++) {
1560            if (!pred->u[i]) {
1561               execmask &= ~(1 << i);
1562            }
1563         }
1564      }
1565   }
1566
1567   switch (inst->Instruction.Saturate) {
1568   case TGSI_SAT_NONE:
1569      for (i = 0; i < QUAD_SIZE; i++)
1570         if (execmask & (1 << i))
1571            dst->i[i] = chan->i[i];
1572      break;
1573
1574   case TGSI_SAT_ZERO_ONE:
1575      for (i = 0; i < QUAD_SIZE; i++)
1576         if (execmask & (1 << i)) {
1577            if (chan->f[i] < 0.0f)
1578               dst->f[i] = 0.0f;
1579            else if (chan->f[i] > 1.0f)
1580               dst->f[i] = 1.0f;
1581            else
1582               dst->i[i] = chan->i[i];
1583         }
1584      break;
1585
1586   case TGSI_SAT_MINUS_PLUS_ONE:
1587      for (i = 0; i < QUAD_SIZE; i++)
1588         if (execmask & (1 << i)) {
1589            if (chan->f[i] < -1.0f)
1590               dst->f[i] = -1.0f;
1591            else if (chan->f[i] > 1.0f)
1592               dst->f[i] = 1.0f;
1593            else
1594               dst->i[i] = chan->i[i];
1595         }
1596      break;
1597
1598   default:
1599      assert( 0 );
1600   }
1601}
1602
/**
 * Fetch channel CHAN of source operand INDEX of the current instruction
 * into VAL, treating the operand as float data for the abs/negate
 * modifiers.  Expects 'mach' and 'inst' to be in scope at the call site.
 */
#define FETCH(VAL, INDEX, CHAN) \
   fetch_source(mach, VAL, &inst->Src[INDEX], CHAN, TGSI_EXEC_DATA_FLOAT)

/**
 * Same as FETCH, but the operand is treated as integer data.
 */
#define IFETCH(VAL, INDEX, CHAN) \
   fetch_source(mach, VAL, &inst->Src[INDEX], CHAN, TGSI_EXEC_DATA_INT)
1608
1609
1610/**
1611 * Execute ARB-style KIL which is predicated by a src register.
1612 * Kill fragment if any of the four values is less than zero.
1613 */
1614static void
1615exec_kil(struct tgsi_exec_machine *mach,
1616         const struct tgsi_full_instruction *inst)
1617{
1618   uint uniquemask;
1619   uint chan_index;
1620   uint kilmask = 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
1621   union tgsi_exec_channel r[1];
1622
1623   /* This mask stores component bits that were already tested. */
1624   uniquemask = 0;
1625
1626   for (chan_index = 0; chan_index < 4; chan_index++)
1627   {
1628      uint swizzle;
1629      uint i;
1630
1631      /* unswizzle channel */
1632      swizzle = tgsi_util_get_full_src_register_swizzle (
1633                        &inst->Src[0],
1634                        chan_index);
1635
1636      /* check if the component has not been already tested */
1637      if (uniquemask & (1 << swizzle))
1638         continue;
1639      uniquemask |= 1 << swizzle;
1640
1641      FETCH(&r[0], 0, chan_index);
1642      for (i = 0; i < 4; i++)
1643         if (r[0].f[i] < 0.0f)
1644            kilmask |= 1 << i;
1645   }
1646
1647   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
1648}
1649
1650/**
1651 * Execute NVIDIA-style KIL which is predicated by a condition code.
1652 * Kill fragment if the condition code is TRUE.
1653 */
1654static void
1655exec_kilp(struct tgsi_exec_machine *mach,
1656          const struct tgsi_full_instruction *inst)
1657{
1658   uint kilmask; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
1659
1660   /* "unconditional" kil */
1661   kilmask = mach->ExecMask;
1662   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
1663}
1664
1665static void
1666emit_vertex(struct tgsi_exec_machine *mach)
1667{
1668   /* FIXME: check for exec mask correctly
1669   unsigned i;
1670   for (i = 0; i < QUAD_SIZE; ++i) {
1671         if ((mach->ExecMask & (1 << i)))
1672   */
1673   if (mach->ExecMask) {
1674      mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] += mach->NumOutputs;
1675      mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]]++;
1676   }
1677}
1678
1679static void
1680emit_primitive(struct tgsi_exec_machine *mach)
1681{
1682   unsigned *prim_count = &mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0];
1683   /* FIXME: check for exec mask correctly
1684   unsigned i;
1685   for (i = 0; i < QUAD_SIZE; ++i) {
1686         if ((mach->ExecMask & (1 << i)))
1687   */
1688   if (mach->ExecMask) {
1689      ++(*prim_count);
1690      debug_assert((*prim_count * mach->NumOutputs) < mach->MaxGeometryShaderOutputs);
1691      mach->Primitives[*prim_count] = 0;
1692   }
1693}
1694
1695static void
1696conditional_emit_primitive(struct tgsi_exec_machine *mach)
1697{
1698   if (TGSI_PROCESSOR_GEOMETRY == mach->Processor) {
1699      int emitted_verts =
1700         mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]];
1701      if (emitted_verts) {
1702         emit_primitive(mach);
1703      }
1704   }
1705}
1706
1707
1708/*
1709 * Fetch four texture samples using STR texture coordinates.
1710 */
1711static void
1712fetch_texel( struct tgsi_sampler *sampler,
1713             const union tgsi_exec_channel *s,
1714             const union tgsi_exec_channel *t,
1715             const union tgsi_exec_channel *p,
1716             const union tgsi_exec_channel *c0,
1717             enum tgsi_sampler_control control,
1718             union tgsi_exec_channel *r,
1719             union tgsi_exec_channel *g,
1720             union tgsi_exec_channel *b,
1721             union tgsi_exec_channel *a )
1722{
1723   uint j;
1724   float rgba[NUM_CHANNELS][QUAD_SIZE];
1725
1726   sampler->get_samples(sampler, s->f, t->f, p->f, c0->f, control, rgba);
1727
1728   for (j = 0; j < 4; j++) {
1729      r->f[j] = rgba[0][j];
1730      g->f[j] = rgba[1][j];
1731      b->f[j] = rgba[2][j];
1732      a->f[j] = rgba[3][j];
1733   }
1734}
1735
1736
/* Values for the 'modifier' argument of exec_tex(). */
#define TEX_MODIFIER_NONE 0          /* plain lookup, W is not fetched */
#define TEX_MODIFIER_PROJECTED 1     /* coordinates are divided by W */
#define TEX_MODIFIER_LOD_BIAS 2      /* W is passed to the sampler as lod bias */
#define TEX_MODIFIER_EXPLICIT_LOD 3  /* W is passed to the sampler as explicit lod */
1741
1742
1743static void
1744exec_tex(struct tgsi_exec_machine *mach,
1745         const struct tgsi_full_instruction *inst,
1746         uint modifier)
1747{
1748   const uint unit = inst->Src[1].Register.Index;
1749   union tgsi_exec_channel r[4];
1750   const union tgsi_exec_channel *lod = &ZeroVec;
1751   enum tgsi_sampler_control control;
1752   uint chan;
1753
1754   if (modifier != TEX_MODIFIER_NONE) {
1755      FETCH(&r[3], 0, CHAN_W);
1756      if (modifier != TEX_MODIFIER_PROJECTED) {
1757         lod = &r[3];
1758      }
1759   }
1760
1761   if (modifier == TEX_MODIFIER_EXPLICIT_LOD) {
1762      control = tgsi_sampler_lod_explicit;
1763   } else {
1764      control = tgsi_sampler_lod_bias;
1765   }
1766
1767   switch (inst->Texture.Texture) {
1768   case TGSI_TEXTURE_1D:
1769      FETCH(&r[0], 0, CHAN_X);
1770
1771      if (modifier == TEX_MODIFIER_PROJECTED) {
1772         micro_div(&r[0], &r[0], &r[3]);
1773      }
1774
1775      fetch_texel(mach->Samplers[unit],
1776                  &r[0], &ZeroVec, &ZeroVec, lod,  /* S, T, P, LOD */
1777                  control,
1778                  &r[0], &r[1], &r[2], &r[3]);     /* R, G, B, A */
1779      break;
1780   case TGSI_TEXTURE_SHADOW1D:
1781      FETCH(&r[0], 0, CHAN_X);
1782      FETCH(&r[2], 0, CHAN_Z);
1783
1784      if (modifier == TEX_MODIFIER_PROJECTED) {
1785         micro_div(&r[0], &r[0], &r[3]);
1786      }
1787
1788      fetch_texel(mach->Samplers[unit],
1789                  &r[0], &ZeroVec, &r[2], lod,  /* S, T, P, LOD */
1790                  control,
1791                  &r[0], &r[1], &r[2], &r[3]);     /* R, G, B, A */
1792      break;
1793
1794   case TGSI_TEXTURE_2D:
1795   case TGSI_TEXTURE_RECT:
1796   case TGSI_TEXTURE_SHADOW2D:
1797   case TGSI_TEXTURE_SHADOWRECT:
1798      FETCH(&r[0], 0, CHAN_X);
1799      FETCH(&r[1], 0, CHAN_Y);
1800      FETCH(&r[2], 0, CHAN_Z);
1801
1802      if (modifier == TEX_MODIFIER_PROJECTED) {
1803         micro_div(&r[0], &r[0], &r[3]);
1804         micro_div(&r[1], &r[1], &r[3]);
1805         micro_div(&r[2], &r[2], &r[3]);
1806      }
1807
1808      fetch_texel(mach->Samplers[unit],
1809                  &r[0], &r[1], &r[2], lod,     /* S, T, P, LOD */
1810                  control,
1811                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
1812      break;
1813
1814   case TGSI_TEXTURE_1D_ARRAY:
1815      FETCH(&r[0], 0, CHAN_X);
1816      FETCH(&r[1], 0, CHAN_Y);
1817
1818      if (modifier == TEX_MODIFIER_PROJECTED) {
1819         micro_div(&r[0], &r[0], &r[3]);
1820      }
1821
1822      fetch_texel(mach->Samplers[unit],
1823                  &r[0], &r[1], &ZeroVec, lod,     /* S, T, P, LOD */
1824                  control,
1825                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
1826      break;
1827   case TGSI_TEXTURE_SHADOW1D_ARRAY:
1828      FETCH(&r[0], 0, CHAN_X);
1829      FETCH(&r[1], 0, CHAN_Y);
1830      FETCH(&r[2], 0, CHAN_Z);
1831
1832      if (modifier == TEX_MODIFIER_PROJECTED) {
1833         micro_div(&r[0], &r[0], &r[3]);
1834      }
1835
1836      fetch_texel(mach->Samplers[unit],
1837                  &r[0], &r[1], &r[2], lod,     /* S, T, P, LOD */
1838                  control,
1839                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
1840      break;
1841
1842   case TGSI_TEXTURE_2D_ARRAY:
1843      FETCH(&r[0], 0, CHAN_X);
1844      FETCH(&r[1], 0, CHAN_Y);
1845      FETCH(&r[2], 0, CHAN_Z);
1846
1847      if (modifier == TEX_MODIFIER_PROJECTED) {
1848         micro_div(&r[0], &r[0], &r[3]);
1849         micro_div(&r[1], &r[1], &r[3]);
1850      }
1851
1852      fetch_texel(mach->Samplers[unit],
1853                  &r[0], &r[1], &r[2], lod,     /* S, T, P, LOD */
1854                  control,
1855                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
1856      break;
1857   case TGSI_TEXTURE_SHADOW2D_ARRAY:
1858      FETCH(&r[0], 0, CHAN_X);
1859      FETCH(&r[1], 0, CHAN_Y);
1860      FETCH(&r[2], 0, CHAN_Z);
1861      FETCH(&r[3], 0, CHAN_W);
1862
1863      fetch_texel(mach->Samplers[unit],
1864                  &r[0], &r[1], &r[2], &r[3],     /* S, T, P, LOD */
1865                  control,
1866                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
1867      break;
1868   case TGSI_TEXTURE_3D:
1869   case TGSI_TEXTURE_CUBE:
1870      FETCH(&r[0], 0, CHAN_X);
1871      FETCH(&r[1], 0, CHAN_Y);
1872      FETCH(&r[2], 0, CHAN_Z);
1873
1874      if (modifier == TEX_MODIFIER_PROJECTED) {
1875         micro_div(&r[0], &r[0], &r[3]);
1876         micro_div(&r[1], &r[1], &r[3]);
1877         micro_div(&r[2], &r[2], &r[3]);
1878      }
1879
1880      fetch_texel(mach->Samplers[unit],
1881                  &r[0], &r[1], &r[2], lod,
1882                  control,
1883                  &r[0], &r[1], &r[2], &r[3]);
1884      break;
1885
1886   default:
1887      assert(0);
1888   }
1889
1890#if 0
1891   debug_printf("fetch r: %g %g %g %g\n",
1892         r[0].f[0], r[0].f[1], r[0].f[2], r[0].f[3]);
1893   debug_printf("fetch g: %g %g %g %g\n",
1894         r[1].f[0], r[1].f[1], r[1].f[2], r[1].f[3]);
1895   debug_printf("fetch b: %g %g %g %g\n",
1896         r[2].f[0], r[2].f[1], r[2].f[2], r[2].f[3]);
1897   debug_printf("fetch a: %g %g %g %g\n",
1898         r[3].f[0], r[3].f[1], r[3].f[2], r[3].f[3]);
1899#endif
1900
1901   for (chan = 0; chan < NUM_CHANNELS; chan++) {
1902      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1903         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
1904      }
1905   }
1906}
1907
1908static void
1909exec_txd(struct tgsi_exec_machine *mach,
1910         const struct tgsi_full_instruction *inst)
1911{
1912   const uint unit = inst->Src[3].Register.Index;
1913   union tgsi_exec_channel r[4];
1914   uint chan;
1915
1916   /*
1917    * XXX: This is fake TXD -- the derivatives are not taken into account, yet.
1918    */
1919
1920   switch (inst->Texture.Texture) {
1921   case TGSI_TEXTURE_1D:
1922   case TGSI_TEXTURE_SHADOW1D:
1923
1924      FETCH(&r[0], 0, CHAN_X);
1925
1926      fetch_texel(mach->Samplers[unit],
1927                  &r[0], &ZeroVec, &ZeroVec, &ZeroVec,   /* S, T, P, BIAS */
1928                  tgsi_sampler_lod_bias,
1929                  &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
1930      break;
1931
1932   case TGSI_TEXTURE_1D_ARRAY:
1933   case TGSI_TEXTURE_2D:
1934   case TGSI_TEXTURE_RECT:
1935   case TGSI_TEXTURE_SHADOW1D_ARRAY:
1936   case TGSI_TEXTURE_SHADOW2D:
1937   case TGSI_TEXTURE_SHADOWRECT:
1938
1939      FETCH(&r[0], 0, CHAN_X);
1940      FETCH(&r[1], 0, CHAN_Y);
1941      FETCH(&r[2], 0, CHAN_Z);
1942
1943      fetch_texel(mach->Samplers[unit],
1944                  &r[0], &r[1], &r[2], &ZeroVec,   /* inputs */
1945                  tgsi_sampler_lod_bias,
1946                  &r[0], &r[1], &r[2], &r[3]);     /* outputs */
1947      break;
1948
1949   case TGSI_TEXTURE_2D_ARRAY:
1950   case TGSI_TEXTURE_3D:
1951   case TGSI_TEXTURE_CUBE:
1952
1953      FETCH(&r[0], 0, CHAN_X);
1954      FETCH(&r[1], 0, CHAN_Y);
1955      FETCH(&r[2], 0, CHAN_Z);
1956
1957      fetch_texel(mach->Samplers[unit],
1958                  &r[0], &r[1], &r[2], &ZeroVec,
1959                  tgsi_sampler_lod_bias,
1960                  &r[0], &r[1], &r[2], &r[3]);
1961      break;
1962
1963   case TGSI_TEXTURE_SHADOW2D_ARRAY:
1964
1965      FETCH(&r[0], 0, CHAN_X);
1966      FETCH(&r[1], 0, CHAN_Y);
1967      FETCH(&r[2], 0, CHAN_Z);
1968      FETCH(&r[3], 0, CHAN_W);
1969
1970      fetch_texel(mach->Samplers[unit],
1971                  &r[0], &r[1], &r[2], &r[3],
1972                  tgsi_sampler_lod_bias,
1973                  &r[0], &r[1], &r[2], &r[3]);
1974      break;
1975
1976   default:
1977      assert(0);
1978   }
1979
1980   for (chan = 0; chan < NUM_CHANNELS; chan++) {
1981      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1982         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
1983      }
1984   }
1985}
1986
1987
1988static void
1989exec_txf(struct tgsi_exec_machine *mach,
1990	 const struct tgsi_full_instruction *inst)
1991{
1992   struct tgsi_sampler *sampler;
1993   const uint unit = inst->Src[2].Register.Index;
1994   union tgsi_exec_channel r[4];
1995   union tgsi_exec_channel offset[3];
1996   uint chan;
1997   float rgba[NUM_CHANNELS][QUAD_SIZE];
1998   int j;
1999   int8_t offsets[3];
2000
2001   if (inst->Texture.NumOffsets == 1) {
2002      union tgsi_exec_channel index;
2003      index.i[0] = index.i[1] = index.i[2] = index.i[3] = inst->TexOffsets[0].Index;
2004      fetch_src_file_channel(mach, 0, inst->TexOffsets[0].File,
2005                             inst->TexOffsets[0].SwizzleX, &index, &ZeroVec, &offset[0]);
2006      fetch_src_file_channel(mach, 0, inst->TexOffsets[0].File,
2007                             inst->TexOffsets[0].SwizzleY, &index, &ZeroVec, &offset[1]);
2008      fetch_src_file_channel(mach, 0, inst->TexOffsets[0].File,
2009                             inst->TexOffsets[0].SwizzleZ, &index, &ZeroVec, &offset[2]);
2010     offsets[0] = offset[0].i[0];
2011     offsets[1] = offset[1].i[0];
2012     offsets[2] = offset[2].i[0];
2013   } else
2014     offsets[0] = offsets[1] = offsets[2] = 0;
2015
2016   IFETCH(&r[3], 0, CHAN_W);
2017
2018   switch(inst->Texture.Texture) {
2019   case TGSI_TEXTURE_3D:
2020   case TGSI_TEXTURE_2D_ARRAY:
2021   case TGSI_TEXTURE_SHADOW2D_ARRAY:
2022      IFETCH(&r[2], 0, CHAN_Z);
2023      /* fallthrough */
2024   case TGSI_TEXTURE_2D:
2025   case TGSI_TEXTURE_RECT:
2026   case TGSI_TEXTURE_SHADOW1D_ARRAY:
2027   case TGSI_TEXTURE_SHADOW2D:
2028   case TGSI_TEXTURE_SHADOWRECT:
2029   case TGSI_TEXTURE_1D_ARRAY:
2030      IFETCH(&r[1], 0, CHAN_Y);
2031      /* fallthrough */
2032   case TGSI_TEXTURE_1D:
2033   case TGSI_TEXTURE_SHADOW1D:
2034      IFETCH(&r[0], 0, CHAN_X);
2035      break;
2036   default:
2037      assert(0);
2038      break;
2039   }
2040
2041   sampler = mach->Samplers[unit];
2042   sampler->get_texel(sampler, r[0].i, r[1].i, r[2].i, r[3].i,
2043		      offsets, rgba);
2044
2045   for (j = 0; j < QUAD_SIZE; j++) {
2046      r[0].f[j] = rgba[0][j];
2047      r[1].f[j] = rgba[1][j];
2048      r[2].f[j] = rgba[2][j];
2049      r[3].f[j] = rgba[3][j];
2050   }
2051
2052   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2053      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2054         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2055      }
2056   }
2057}
2058
2059static void
2060exec_txq(struct tgsi_exec_machine *mach,
2061         const struct tgsi_full_instruction *inst)
2062{
2063   struct tgsi_sampler *sampler;
2064   const uint unit = inst->Src[1].Register.Index;
2065   int result[4];
2066   union tgsi_exec_channel r[4], src;
2067   uint chan;
2068   int i,j;
2069
2070   fetch_source(mach, &src, &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_INT);
2071   sampler = mach->Samplers[unit];
2072
2073   sampler->get_dims(sampler, src.i[0], result);
2074
2075   for (i = 0; i < QUAD_SIZE; i++) {
2076      for (j = 0; j < 4; j++) {
2077	 r[j].i[i] = result[j];
2078      }
2079   }
2080
2081   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2082      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2083	 store_dest(mach, &r[chan], &inst->Dst[0], inst, chan,
2084		    TGSI_EXEC_DATA_INT);
2085      }
2086   }
2087}
2088
2089static void
2090exec_sample(struct tgsi_exec_machine *mach,
2091            const struct tgsi_full_instruction *inst,
2092            uint modifier)
2093{
2094   const uint resource_unit = inst->Src[1].Register.Index;
2095   const uint sampler_unit = inst->Src[2].Register.Index;
2096   union tgsi_exec_channel r[4];
2097   const union tgsi_exec_channel *lod = &ZeroVec;
2098   enum tgsi_sampler_control control;
2099   uint chan;
2100
2101   if (modifier != TEX_MODIFIER_NONE) {
2102      if (modifier == TEX_MODIFIER_LOD_BIAS)
2103         FETCH(&r[3], 3, CHAN_X);
2104      else /*TEX_MODIFIER_LOD*/
2105         FETCH(&r[3], 0, CHAN_W);
2106
2107      if (modifier != TEX_MODIFIER_PROJECTED) {
2108         lod = &r[3];
2109      }
2110   }
2111
2112   if (modifier == TEX_MODIFIER_EXPLICIT_LOD) {
2113      control = tgsi_sampler_lod_explicit;
2114   } else {
2115      control = tgsi_sampler_lod_bias;
2116   }
2117
2118   switch (mach->Resources[resource_unit].Resource) {
2119   case TGSI_TEXTURE_1D:
2120   case TGSI_TEXTURE_SHADOW1D:
2121      FETCH(&r[0], 0, CHAN_X);
2122
2123      if (modifier == TEX_MODIFIER_PROJECTED) {
2124         micro_div(&r[0], &r[0], &r[3]);
2125      }
2126
2127      fetch_texel(mach->Samplers[sampler_unit],
2128                  &r[0], &ZeroVec, &ZeroVec, lod,  /* S, T, P, LOD */
2129                  control,
2130                  &r[0], &r[1], &r[2], &r[3]);     /* R, G, B, A */
2131      break;
2132
2133   case TGSI_TEXTURE_1D_ARRAY:
2134   case TGSI_TEXTURE_2D:
2135   case TGSI_TEXTURE_RECT:
2136   case TGSI_TEXTURE_SHADOW1D_ARRAY:
2137   case TGSI_TEXTURE_SHADOW2D:
2138   case TGSI_TEXTURE_SHADOWRECT:
2139      FETCH(&r[0], 0, CHAN_X);
2140      FETCH(&r[1], 0, CHAN_Y);
2141      FETCH(&r[2], 0, CHAN_Z);
2142
2143      if (modifier == TEX_MODIFIER_PROJECTED) {
2144         micro_div(&r[0], &r[0], &r[3]);
2145         micro_div(&r[1], &r[1], &r[3]);
2146         micro_div(&r[2], &r[2], &r[3]);
2147      }
2148
2149      fetch_texel(mach->Samplers[sampler_unit],
2150                  &r[0], &r[1], &r[2], lod,     /* S, T, P, LOD */
2151                  control,
2152                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
2153      break;
2154
2155   case TGSI_TEXTURE_2D_ARRAY:
2156   case TGSI_TEXTURE_3D:
2157   case TGSI_TEXTURE_CUBE:
2158      FETCH(&r[0], 0, CHAN_X);
2159      FETCH(&r[1], 0, CHAN_Y);
2160      FETCH(&r[2], 0, CHAN_Z);
2161
2162      if (modifier == TEX_MODIFIER_PROJECTED) {
2163         micro_div(&r[0], &r[0], &r[3]);
2164         micro_div(&r[1], &r[1], &r[3]);
2165         micro_div(&r[2], &r[2], &r[3]);
2166      }
2167
2168      fetch_texel(mach->Samplers[sampler_unit],
2169                  &r[0], &r[1], &r[2], lod,
2170                  control,
2171                  &r[0], &r[1], &r[2], &r[3]);
2172      break;
2173
2174   case TGSI_TEXTURE_SHADOW2D_ARRAY:
2175      FETCH(&r[0], 0, CHAN_X);
2176      FETCH(&r[1], 0, CHAN_Y);
2177      FETCH(&r[2], 0, CHAN_Z);
2178      FETCH(&r[3], 0, CHAN_W);
2179
2180      assert(modifier != TEX_MODIFIER_PROJECTED);
2181
2182      fetch_texel(mach->Samplers[sampler_unit],
2183                  &r[0], &r[1], &r[2], &r[3],
2184                  control,
2185                  &r[0], &r[1], &r[2], &r[3]);
2186      break;
2187
2188   default:
2189      assert(0);
2190   }
2191
2192   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2193      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2194         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2195      }
2196   }
2197}
2198
2199static void
2200exec_sample_d(struct tgsi_exec_machine *mach,
2201              const struct tgsi_full_instruction *inst)
2202{
2203   const uint resource_unit = inst->Src[1].Register.Index;
2204   const uint sampler_unit = inst->Src[2].Register.Index;
2205   union tgsi_exec_channel r[4];
2206   uint chan;
2207   /*
2208    * XXX: This is fake SAMPLE_D -- the derivatives are not taken into account, yet.
2209    */
2210
2211   switch (mach->Resources[resource_unit].Resource) {
2212   case TGSI_TEXTURE_1D:
2213   case TGSI_TEXTURE_SHADOW1D:
2214
2215      FETCH(&r[0], 0, CHAN_X);
2216
2217      fetch_texel(mach->Samplers[sampler_unit],
2218                  &r[0], &ZeroVec, &ZeroVec, &ZeroVec,   /* S, T, P, BIAS */
2219                  tgsi_sampler_lod_bias,
2220                  &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
2221      break;
2222
2223   case TGSI_TEXTURE_2D:
2224   case TGSI_TEXTURE_RECT:
2225   case TGSI_TEXTURE_SHADOW2D:
2226   case TGSI_TEXTURE_SHADOWRECT:
2227
2228      FETCH(&r[0], 0, CHAN_X);
2229      FETCH(&r[1], 0, CHAN_Y);
2230      FETCH(&r[2], 0, CHAN_Z);
2231
2232      fetch_texel(mach->Samplers[sampler_unit],
2233                  &r[0], &r[1], &r[2], &ZeroVec,   /* inputs */
2234                  tgsi_sampler_lod_bias,
2235                  &r[0], &r[1], &r[2], &r[3]);     /* outputs */
2236      break;
2237
2238   case TGSI_TEXTURE_3D:
2239   case TGSI_TEXTURE_CUBE:
2240
2241      FETCH(&r[0], 0, CHAN_X);
2242      FETCH(&r[1], 0, CHAN_Y);
2243      FETCH(&r[2], 0, CHAN_Z);
2244
2245      fetch_texel(mach->Samplers[sampler_unit],
2246                  &r[0], &r[1], &r[2], &ZeroVec,
2247                  tgsi_sampler_lod_bias,
2248                  &r[0], &r[1], &r[2], &r[3]);
2249      break;
2250
2251   default:
2252      assert(0);
2253   }
2254
2255   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2256      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2257         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2258      }
2259   }
2260}
2261
2262
2263/**
2264 * Evaluate a constant-valued coefficient at the position of the
2265 * current quad.
2266 */
2267static void
2268eval_constant_coef(
2269   struct tgsi_exec_machine *mach,
2270   unsigned attrib,
2271   unsigned chan )
2272{
2273   unsigned i;
2274
2275   for( i = 0; i < QUAD_SIZE; i++ ) {
2276      mach->Inputs[attrib].xyzw[chan].f[i] = mach->InterpCoefs[attrib].a0[chan];
2277   }
2278}
2279
2280/**
2281 * Evaluate a linear-valued coefficient at the position of the
2282 * current quad.
2283 */
2284static void
2285eval_linear_coef(
2286   struct tgsi_exec_machine *mach,
2287   unsigned attrib,
2288   unsigned chan )
2289{
2290   const float x = mach->QuadPos.xyzw[0].f[0];
2291   const float y = mach->QuadPos.xyzw[1].f[0];
2292   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
2293   const float dady = mach->InterpCoefs[attrib].dady[chan];
2294   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
2295   mach->Inputs[attrib].xyzw[chan].f[0] = a0;
2296   mach->Inputs[attrib].xyzw[chan].f[1] = a0 + dadx;
2297   mach->Inputs[attrib].xyzw[chan].f[2] = a0 + dady;
2298   mach->Inputs[attrib].xyzw[chan].f[3] = a0 + dadx + dady;
2299}
2300
2301/**
2302 * Evaluate a perspective-valued coefficient at the position of the
2303 * current quad.
2304 */
2305static void
2306eval_perspective_coef(
2307   struct tgsi_exec_machine *mach,
2308   unsigned attrib,
2309   unsigned chan )
2310{
2311   const float x = mach->QuadPos.xyzw[0].f[0];
2312   const float y = mach->QuadPos.xyzw[1].f[0];
2313   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
2314   const float dady = mach->InterpCoefs[attrib].dady[chan];
2315   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
2316   const float *w = mach->QuadPos.xyzw[3].f;
2317   /* divide by W here */
2318   mach->Inputs[attrib].xyzw[chan].f[0] = a0 / w[0];
2319   mach->Inputs[attrib].xyzw[chan].f[1] = (a0 + dadx) / w[1];
2320   mach->Inputs[attrib].xyzw[chan].f[2] = (a0 + dady) / w[2];
2321   mach->Inputs[attrib].xyzw[chan].f[3] = (a0 + dadx + dady) / w[3];
2322}
2323
2324
/**
 * Signature shared by the eval_*_coef() interpolation helpers:
 * evaluate one channel of one input attribute for the current quad.
 */
typedef void (*eval_coef_func)(struct tgsi_exec_machine *mach,
                               unsigned attrib,
                               unsigned chan);
2329
2330static void
2331exec_declaration(struct tgsi_exec_machine *mach,
2332                 const struct tgsi_full_declaration *decl)
2333{
2334   if (decl->Declaration.File == TGSI_FILE_RESOURCE) {
2335      mach->Resources[decl->Range.First] = decl->Resource;
2336      return;
2337   }
2338
2339   if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) {
2340      if (decl->Declaration.File == TGSI_FILE_INPUT) {
2341         uint first, last, mask;
2342
2343         first = decl->Range.First;
2344         last = decl->Range.Last;
2345         mask = decl->Declaration.UsageMask;
2346
2347         /* XXX we could remove this special-case code since
2348          * mach->InterpCoefs[first].a0 should already have the
2349          * front/back-face value.  But we should first update the
2350          * ureg code to emit the right UsageMask value (WRITEMASK_X).
2351          * Then, we could remove the tgsi_exec_machine::Face field.
2352          */
2353         /* XXX make FACE a system value */
2354         if (decl->Semantic.Name == TGSI_SEMANTIC_FACE) {
2355            uint i;
2356
2357            assert(decl->Semantic.Index == 0);
2358            assert(first == last);
2359
2360            for (i = 0; i < QUAD_SIZE; i++) {
2361               mach->Inputs[first].xyzw[0].f[i] = mach->Face;
2362            }
2363         } else {
2364            eval_coef_func eval;
2365            uint i, j;
2366
2367            switch (decl->Declaration.Interpolate) {
2368            case TGSI_INTERPOLATE_CONSTANT:
2369               eval = eval_constant_coef;
2370               break;
2371
2372            case TGSI_INTERPOLATE_LINEAR:
2373               eval = eval_linear_coef;
2374               break;
2375
2376            case TGSI_INTERPOLATE_PERSPECTIVE:
2377               eval = eval_perspective_coef;
2378               break;
2379
2380            case TGSI_INTERPOLATE_COLOR:
2381               eval = mach->flatshade_color ? eval_constant_coef : eval_perspective_coef;
2382               break;
2383
2384            default:
2385               assert(0);
2386               return;
2387            }
2388
2389            for (j = 0; j < NUM_CHANNELS; j++) {
2390               if (mask & (1 << j)) {
2391                  for (i = first; i <= last; i++) {
2392                     eval(mach, i, j);
2393                  }
2394               }
2395            }
2396         }
2397      }
2398   }
2399
2400   if (decl->Declaration.File == TGSI_FILE_SYSTEM_VALUE) {
2401      mach->SysSemanticToIndex[decl->Declaration.Semantic] = decl->Range.First;
2402   }
2403}
2404
2405
/** Per-channel operation that takes no sources and only writes a result. */
typedef void (*micro_op)(union tgsi_exec_channel *dst);
2407
2408static void
2409exec_vector(struct tgsi_exec_machine *mach,
2410            const struct tgsi_full_instruction *inst,
2411            micro_op op,
2412            enum tgsi_exec_datatype dst_datatype)
2413{
2414   unsigned int chan;
2415
2416   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2417      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2418         union tgsi_exec_channel dst;
2419
2420         op(&dst);
2421         store_dest(mach, &dst, &inst->Dst[0], inst, chan, dst_datatype);
2422      }
2423   }
2424}
2425
/** Per-channel operation with one source operand. */
typedef void (*micro_unary_op)(union tgsi_exec_channel *dst,
                               const union tgsi_exec_channel *src);
2428
2429static void
2430exec_scalar_unary(struct tgsi_exec_machine *mach,
2431                  const struct tgsi_full_instruction *inst,
2432                  micro_unary_op op,
2433                  enum tgsi_exec_datatype dst_datatype,
2434                  enum tgsi_exec_datatype src_datatype)
2435{
2436   unsigned int chan;
2437   union tgsi_exec_channel src;
2438   union tgsi_exec_channel dst;
2439
2440   fetch_source(mach, &src, &inst->Src[0], CHAN_X, src_datatype);
2441   op(&dst, &src);
2442   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2443      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2444         store_dest(mach, &dst, &inst->Dst[0], inst, chan, dst_datatype);
2445      }
2446   }
2447}
2448
2449static void
2450exec_vector_unary(struct tgsi_exec_machine *mach,
2451                  const struct tgsi_full_instruction *inst,
2452                  micro_unary_op op,
2453                  enum tgsi_exec_datatype dst_datatype,
2454                  enum tgsi_exec_datatype src_datatype)
2455{
2456   unsigned int chan;
2457   struct tgsi_exec_vector dst;
2458
2459   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2460      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2461         union tgsi_exec_channel src;
2462
2463         fetch_source(mach, &src, &inst->Src[0], chan, src_datatype);
2464         op(&dst.xyzw[chan], &src);
2465      }
2466   }
2467   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2468      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2469         store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
2470      }
2471   }
2472}
2473
/** Per-channel operation with two source operands. */
typedef void (*micro_binary_op)(union tgsi_exec_channel *dst,
                                const union tgsi_exec_channel *a,
                                const union tgsi_exec_channel *b);
2477
2478static void
2479exec_scalar_binary(struct tgsi_exec_machine *mach,
2480                   const struct tgsi_full_instruction *inst,
2481                   micro_binary_op op,
2482                   enum tgsi_exec_datatype dst_datatype,
2483                   enum tgsi_exec_datatype src_datatype)
2484{
2485   unsigned int chan;
2486   union tgsi_exec_channel src[2];
2487   union tgsi_exec_channel dst;
2488
2489   fetch_source(mach, &src[0], &inst->Src[0], CHAN_X, src_datatype);
2490   fetch_source(mach, &src[1], &inst->Src[1], CHAN_Y, src_datatype);
2491   op(&dst, &src[0], &src[1]);
2492   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2493      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2494         store_dest(mach, &dst, &inst->Dst[0], inst, chan, dst_datatype);
2495      }
2496   }
2497}
2498
2499static void
2500exec_vector_binary(struct tgsi_exec_machine *mach,
2501                   const struct tgsi_full_instruction *inst,
2502                   micro_binary_op op,
2503                   enum tgsi_exec_datatype dst_datatype,
2504                   enum tgsi_exec_datatype src_datatype)
2505{
2506   unsigned int chan;
2507   struct tgsi_exec_vector dst;
2508
2509   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2510      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2511         union tgsi_exec_channel src[2];
2512
2513         fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
2514         fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
2515         op(&dst.xyzw[chan], &src[0], &src[1]);
2516      }
2517   }
2518   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2519      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2520         store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
2521      }
2522   }
2523}
2524
/** Per-channel operation with three source operands. */
typedef void (*micro_trinary_op)(union tgsi_exec_channel *dst,
                                 const union tgsi_exec_channel *a,
                                 const union tgsi_exec_channel *b,
                                 const union tgsi_exec_channel *c);
2529
2530static void
2531exec_vector_trinary(struct tgsi_exec_machine *mach,
2532                    const struct tgsi_full_instruction *inst,
2533                    micro_trinary_op op,
2534                    enum tgsi_exec_datatype dst_datatype,
2535                    enum tgsi_exec_datatype src_datatype)
2536{
2537   unsigned int chan;
2538   struct tgsi_exec_vector dst;
2539
2540   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2541      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2542         union tgsi_exec_channel src[3];
2543
2544         fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
2545         fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
2546         fetch_source(mach, &src[2], &inst->Src[2], chan, src_datatype);
2547         op(&dst.xyzw[chan], &src[0], &src[1], &src[2]);
2548      }
2549   }
2550   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2551      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2552         store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
2553      }
2554   }
2555}
2556
2557static void
2558exec_dp3(struct tgsi_exec_machine *mach,
2559         const struct tgsi_full_instruction *inst)
2560{
2561   unsigned int chan;
2562   union tgsi_exec_channel arg[3];
2563
2564   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2565   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2566   micro_mul(&arg[2], &arg[0], &arg[1]);
2567
2568   for (chan = CHAN_Y; chan <= CHAN_Z; chan++) {
2569      fetch_source(mach, &arg[0], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
2570      fetch_source(mach, &arg[1], &inst->Src[1], chan, TGSI_EXEC_DATA_FLOAT);
2571      micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
2572   }
2573
2574   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2575      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2576         store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2577      }
2578   }
2579}
2580
2581static void
2582exec_dp4(struct tgsi_exec_machine *mach,
2583         const struct tgsi_full_instruction *inst)
2584{
2585   unsigned int chan;
2586   union tgsi_exec_channel arg[3];
2587
2588   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2589   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2590   micro_mul(&arg[2], &arg[0], &arg[1]);
2591
2592   for (chan = CHAN_Y; chan <= CHAN_W; chan++) {
2593      fetch_source(mach, &arg[0], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
2594      fetch_source(mach, &arg[1], &inst->Src[1], chan, TGSI_EXEC_DATA_FLOAT);
2595      micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
2596   }
2597
2598   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2599      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2600         store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2601      }
2602   }
2603}
2604
2605static void
2606exec_dp2a(struct tgsi_exec_machine *mach,
2607          const struct tgsi_full_instruction *inst)
2608{
2609   unsigned int chan;
2610   union tgsi_exec_channel arg[3];
2611
2612   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2613   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2614   micro_mul(&arg[2], &arg[0], &arg[1]);
2615
2616   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2617   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2618   micro_mad(&arg[0], &arg[0], &arg[1], &arg[2]);
2619
2620   fetch_source(mach, &arg[1], &inst->Src[2], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2621   micro_add(&arg[0], &arg[0], &arg[1]);
2622
2623   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2624      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2625         store_dest(mach, &arg[0], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2626      }
2627   }
2628}
2629
2630static void
2631exec_dph(struct tgsi_exec_machine *mach,
2632         const struct tgsi_full_instruction *inst)
2633{
2634   unsigned int chan;
2635   union tgsi_exec_channel arg[3];
2636
2637   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2638   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2639   micro_mul(&arg[2], &arg[0], &arg[1]);
2640
2641   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2642   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2643   micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
2644
2645   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2646   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2647   micro_mad(&arg[0], &arg[0], &arg[1], &arg[2]);
2648
2649   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_W, TGSI_EXEC_DATA_FLOAT);
2650   micro_add(&arg[0], &arg[0], &arg[1]);
2651
2652   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2653      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2654         store_dest(mach, &arg[0], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2655      }
2656   }
2657}
2658
2659static void
2660exec_dp2(struct tgsi_exec_machine *mach,
2661         const struct tgsi_full_instruction *inst)
2662{
2663   unsigned int chan;
2664   union tgsi_exec_channel arg[3];
2665
2666   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2667   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2668   micro_mul(&arg[2], &arg[0], &arg[1]);
2669
2670   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2671   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2672   micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
2673
2674   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2675      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2676         store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2677      }
2678   }
2679}
2680
2681static void
2682exec_nrm4(struct tgsi_exec_machine *mach,
2683          const struct tgsi_full_instruction *inst)
2684{
2685   unsigned int chan;
2686   union tgsi_exec_channel arg[4];
2687   union tgsi_exec_channel scale;
2688
2689   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2690   micro_mul(&scale, &arg[0], &arg[0]);
2691
2692   for (chan = CHAN_Y; chan <= CHAN_W; chan++) {
2693      union tgsi_exec_channel product;
2694
2695      fetch_source(mach, &arg[chan], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
2696      micro_mul(&product, &arg[chan], &arg[chan]);
2697      micro_add(&scale, &scale, &product);
2698   }
2699
2700   micro_rsq(&scale, &scale);
2701
2702   for (chan = CHAN_X; chan <= CHAN_W; chan++) {
2703      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2704         micro_mul(&arg[chan], &arg[chan], &scale);
2705         store_dest(mach, &arg[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2706      }
2707   }
2708}
2709
2710static void
2711exec_nrm3(struct tgsi_exec_machine *mach,
2712          const struct tgsi_full_instruction *inst)
2713{
2714   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XYZ) {
2715      unsigned int chan;
2716      union tgsi_exec_channel arg[3];
2717      union tgsi_exec_channel scale;
2718
2719      fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2720      micro_mul(&scale, &arg[0], &arg[0]);
2721
2722      for (chan = CHAN_Y; chan <= CHAN_Z; chan++) {
2723         union tgsi_exec_channel product;
2724
2725         fetch_source(mach, &arg[chan], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
2726         micro_mul(&product, &arg[chan], &arg[chan]);
2727         micro_add(&scale, &scale, &product);
2728      }
2729
2730      micro_rsq(&scale, &scale);
2731
2732      for (chan = CHAN_X; chan <= CHAN_Z; chan++) {
2733         if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2734            micro_mul(&arg[chan], &arg[chan], &scale);
2735            store_dest(mach, &arg[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2736         }
2737      }
2738   }
2739
2740   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2741      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2742   }
2743}
2744
2745static void
2746exec_scs(struct tgsi_exec_machine *mach,
2747         const struct tgsi_full_instruction *inst)
2748{
2749   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY) {
2750      union tgsi_exec_channel arg;
2751      union tgsi_exec_channel result;
2752
2753      fetch_source(mach, &arg, &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2754
2755      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2756         micro_cos(&result, &arg);
2757         store_dest(mach, &result, &inst->Dst[0], inst, CHAN_X, TGSI_EXEC_DATA_FLOAT);
2758      }
2759      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2760         micro_sin(&result, &arg);
2761         store_dest(mach, &result, &inst->Dst[0], inst, CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2762      }
2763   }
2764   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2765      store_dest(mach, &ZeroVec, &inst->Dst[0], inst, CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2766   }
2767   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2768      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2769   }
2770}
2771
2772static void
2773exec_x2d(struct tgsi_exec_machine *mach,
2774         const struct tgsi_full_instruction *inst)
2775{
2776   union tgsi_exec_channel r[4];
2777   union tgsi_exec_channel d[2];
2778
2779   fetch_source(mach, &r[0], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2780   fetch_source(mach, &r[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2781   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XZ) {
2782      fetch_source(mach, &r[2], &inst->Src[2], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2783      micro_mul(&r[2], &r[2], &r[0]);
2784      fetch_source(mach, &r[3], &inst->Src[2], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2785      micro_mul(&r[3], &r[3], &r[1]);
2786      micro_add(&r[2], &r[2], &r[3]);
2787      fetch_source(mach, &r[3], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2788      micro_add(&d[0], &r[2], &r[3]);
2789   }
2790   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_YW) {
2791      fetch_source(mach, &r[2], &inst->Src[2], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2792      micro_mul(&r[2], &r[2], &r[0]);
2793      fetch_source(mach, &r[3], &inst->Src[2], CHAN_W, TGSI_EXEC_DATA_FLOAT);
2794      micro_mul(&r[3], &r[3], &r[1]);
2795      micro_add(&r[2], &r[2], &r[3]);
2796      fetch_source(mach, &r[3], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2797      micro_add(&d[1], &r[2], &r[3]);
2798   }
2799   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2800      store_dest(mach, &d[0], &inst->Dst[0], inst, CHAN_X, TGSI_EXEC_DATA_FLOAT);
2801   }
2802   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2803      store_dest(mach, &d[1], &inst->Dst[0], inst, CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2804   }
2805   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2806      store_dest(mach, &d[0], &inst->Dst[0], inst, CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2807   }
2808   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2809      store_dest(mach, &d[1], &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2810   }
2811}
2812
2813static void
2814exec_rfl(struct tgsi_exec_machine *mach,
2815         const struct tgsi_full_instruction *inst)
2816{
2817   union tgsi_exec_channel r[9];
2818
2819   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XYZ) {
2820      /* r0 = dp3(src0, src0) */
2821      fetch_source(mach, &r[2], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2822      micro_mul(&r[0], &r[2], &r[2]);
2823      fetch_source(mach, &r[4], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2824      micro_mul(&r[8], &r[4], &r[4]);
2825      micro_add(&r[0], &r[0], &r[8]);
2826      fetch_source(mach, &r[6], &inst->Src[0], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2827      micro_mul(&r[8], &r[6], &r[6]);
2828      micro_add(&r[0], &r[0], &r[8]);
2829
2830      /* r1 = dp3(src0, src1) */
2831      fetch_source(mach, &r[3], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2832      micro_mul(&r[1], &r[2], &r[3]);
2833      fetch_source(mach, &r[5], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2834      micro_mul(&r[8], &r[4], &r[5]);
2835      micro_add(&r[1], &r[1], &r[8]);
2836      fetch_source(mach, &r[7], &inst->Src[1], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2837      micro_mul(&r[8], &r[6], &r[7]);
2838      micro_add(&r[1], &r[1], &r[8]);
2839
2840      /* r1 = 2 * r1 / r0 */
2841      micro_add(&r[1], &r[1], &r[1]);
2842      micro_div(&r[1], &r[1], &r[0]);
2843
2844      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2845         micro_mul(&r[2], &r[2], &r[1]);
2846         micro_sub(&r[2], &r[2], &r[3]);
2847         store_dest(mach, &r[2], &inst->Dst[0], inst, CHAN_X, TGSI_EXEC_DATA_FLOAT);
2848      }
2849      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2850         micro_mul(&r[4], &r[4], &r[1]);
2851         micro_sub(&r[4], &r[4], &r[5]);
2852         store_dest(mach, &r[4], &inst->Dst[0], inst, CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2853      }
2854      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2855         micro_mul(&r[6], &r[6], &r[1]);
2856         micro_sub(&r[6], &r[6], &r[7]);
2857         store_dest(mach, &r[6], &inst->Dst[0], inst, CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2858      }
2859   }
2860   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2861      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2862   }
2863}
2864
2865static void
2866exec_xpd(struct tgsi_exec_machine *mach,
2867         const struct tgsi_full_instruction *inst)
2868{
2869   union tgsi_exec_channel r[6];
2870   union tgsi_exec_channel d[3];
2871
2872   fetch_source(mach, &r[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2873   fetch_source(mach, &r[1], &inst->Src[1], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2874
2875   micro_mul(&r[2], &r[0], &r[1]);
2876
2877   fetch_source(mach, &r[3], &inst->Src[0], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2878   fetch_source(mach, &r[4], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2879
2880   micro_mul(&r[5], &r[3], &r[4] );
2881   micro_sub(&d[CHAN_X], &r[2], &r[5]);
2882
2883   fetch_source(mach, &r[2], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2884
2885   micro_mul(&r[3], &r[3], &r[2]);
2886
2887   fetch_source(mach, &r[5], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2888
2889   micro_mul(&r[1], &r[1], &r[5]);
2890   micro_sub(&d[CHAN_Y], &r[3], &r[1]);
2891
2892   micro_mul(&r[5], &r[5], &r[4]);
2893   micro_mul(&r[0], &r[0], &r[2]);
2894   micro_sub(&d[CHAN_Z], &r[5], &r[0]);
2895
2896   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2897      store_dest(mach, &d[CHAN_X], &inst->Dst[0], inst, CHAN_X, TGSI_EXEC_DATA_FLOAT);
2898   }
2899   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2900      store_dest(mach, &d[CHAN_Y], &inst->Dst[0], inst, CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2901   }
2902   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2903      store_dest(mach, &d[CHAN_Z], &inst->Dst[0], inst, CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2904   }
2905   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2906      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2907   }
2908}
2909
2910static void
2911exec_dst(struct tgsi_exec_machine *mach,
2912         const struct tgsi_full_instruction *inst)
2913{
2914   union tgsi_exec_channel r[2];
2915   union tgsi_exec_channel d[4];
2916
2917   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2918      fetch_source(mach, &r[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2919      fetch_source(mach, &r[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2920      micro_mul(&d[CHAN_Y], &r[0], &r[1]);
2921   }
2922   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2923      fetch_source(mach, &d[CHAN_Z], &inst->Src[0], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2924   }
2925   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2926      fetch_source(mach, &d[CHAN_W], &inst->Src[1], CHAN_W, TGSI_EXEC_DATA_FLOAT);
2927   }
2928
2929   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2930      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_X, TGSI_EXEC_DATA_FLOAT);
2931   }
2932   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2933      store_dest(mach, &d[CHAN_Y], &inst->Dst[0], inst, CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2934   }
2935   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2936      store_dest(mach, &d[CHAN_Z], &inst->Dst[0], inst, CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2937   }
2938   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2939      store_dest(mach, &d[CHAN_W], &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2940   }
2941}
2942
2943static void
2944exec_log(struct tgsi_exec_machine *mach,
2945         const struct tgsi_full_instruction *inst)
2946{
2947   union tgsi_exec_channel r[3];
2948
2949   fetch_source(mach, &r[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2950   micro_abs(&r[2], &r[0]);  /* r2 = abs(r0) */
2951   micro_lg2(&r[1], &r[2]);  /* r1 = lg2(r2) */
2952   micro_flr(&r[0], &r[1]);  /* r0 = floor(r1) */
2953   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2954      store_dest(mach, &r[0], &inst->Dst[0], inst, CHAN_X, TGSI_EXEC_DATA_FLOAT);
2955   }
2956   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2957      micro_exp2(&r[0], &r[0]);       /* r0 = 2 ^ r0 */
2958      micro_div(&r[0], &r[2], &r[0]); /* r0 = r2 / r0 */
2959      store_dest(mach, &r[0], &inst->Dst[0], inst, CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2960   }
2961   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2962      store_dest(mach, &r[1], &inst->Dst[0], inst, CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2963   }
2964   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2965      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2966   }
2967}
2968
2969static void
2970exec_exp(struct tgsi_exec_machine *mach,
2971         const struct tgsi_full_instruction *inst)
2972{
2973   union tgsi_exec_channel r[3];
2974
2975   fetch_source(mach, &r[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2976   micro_flr(&r[1], &r[0]);  /* r1 = floor(r0) */
2977   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2978      micro_exp2(&r[2], &r[1]);       /* r2 = 2 ^ r1 */
2979      store_dest(mach, &r[2], &inst->Dst[0], inst, CHAN_X, TGSI_EXEC_DATA_FLOAT);
2980   }
2981   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2982      micro_sub(&r[2], &r[0], &r[1]); /* r2 = r0 - r1 */
2983      store_dest(mach, &r[2], &inst->Dst[0], inst, CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2984   }
2985   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2986      micro_exp2(&r[2], &r[0]);       /* r2 = 2 ^ r0 */
2987      store_dest(mach, &r[2], &inst->Dst[0], inst, CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2988   }
2989   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2990      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2991   }
2992}
2993
2994static void
2995exec_lit(struct tgsi_exec_machine *mach,
2996         const struct tgsi_full_instruction *inst)
2997{
2998   union tgsi_exec_channel r[3];
2999   union tgsi_exec_channel d[3];
3000
3001   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_YZ) {
3002      fetch_source(mach, &r[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
3003      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
3004         fetch_source(mach, &r[1], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3005         micro_max(&r[1], &r[1], &ZeroVec);
3006
3007         fetch_source(mach, &r[2], &inst->Src[0], CHAN_W, TGSI_EXEC_DATA_FLOAT);
3008         micro_min(&r[2], &r[2], &P128Vec);
3009         micro_max(&r[2], &r[2], &M128Vec);
3010         micro_pow(&r[1], &r[1], &r[2]);
3011         micro_lt(&d[CHAN_Z], &ZeroVec, &r[0], &r[1], &ZeroVec);
3012         store_dest(mach, &d[CHAN_Z], &inst->Dst[0], inst, CHAN_Z, TGSI_EXEC_DATA_FLOAT);
3013      }
3014      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
3015         micro_max(&d[CHAN_Y], &r[0], &ZeroVec);
3016         store_dest(mach, &d[CHAN_Y], &inst->Dst[0], inst, CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3017      }
3018   }
3019   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
3020      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_X, TGSI_EXEC_DATA_FLOAT);
3021   }
3022
3023   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
3024      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
3025   }
3026}
3027
3028static void
3029exec_break(struct tgsi_exec_machine *mach)
3030{
3031   if (mach->BreakType == TGSI_EXEC_BREAK_INSIDE_LOOP) {
3032      /* turn off loop channels for each enabled exec channel */
3033      mach->LoopMask &= ~mach->ExecMask;
3034      /* Todo: if mach->LoopMask == 0, jump to end of loop */
3035      UPDATE_EXEC_MASK(mach);
3036   } else {
3037      assert(mach->BreakType == TGSI_EXEC_BREAK_INSIDE_SWITCH);
3038
3039      mach->Switch.mask = 0x0;
3040
3041      UPDATE_EXEC_MASK(mach);
3042   }
3043}
3044
3045static void
3046exec_switch(struct tgsi_exec_machine *mach,
3047            const struct tgsi_full_instruction *inst)
3048{
3049   assert(mach->SwitchStackTop < TGSI_EXEC_MAX_SWITCH_NESTING);
3050   assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
3051
3052   mach->SwitchStack[mach->SwitchStackTop++] = mach->Switch;
3053   fetch_source(mach, &mach->Switch.selector, &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_UINT);
3054   mach->Switch.mask = 0x0;
3055   mach->Switch.defaultMask = 0x0;
3056
3057   mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
3058   mach->BreakType = TGSI_EXEC_BREAK_INSIDE_SWITCH;
3059
3060   UPDATE_EXEC_MASK(mach);
3061}
3062
3063static void
3064exec_case(struct tgsi_exec_machine *mach,
3065          const struct tgsi_full_instruction *inst)
3066{
3067   uint prevMask = mach->SwitchStack[mach->SwitchStackTop - 1].mask;
3068   union tgsi_exec_channel src;
3069   uint mask = 0;
3070
3071   fetch_source(mach, &src, &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_UINT);
3072
3073   if (mach->Switch.selector.u[0] == src.u[0]) {
3074      mask |= 0x1;
3075   }
3076   if (mach->Switch.selector.u[1] == src.u[1]) {
3077      mask |= 0x2;
3078   }
3079   if (mach->Switch.selector.u[2] == src.u[2]) {
3080      mask |= 0x4;
3081   }
3082   if (mach->Switch.selector.u[3] == src.u[3]) {
3083      mask |= 0x8;
3084   }
3085
3086   mach->Switch.defaultMask |= mask;
3087
3088   mach->Switch.mask |= mask & prevMask;
3089
3090   UPDATE_EXEC_MASK(mach);
3091}
3092
3093static void
3094exec_default(struct tgsi_exec_machine *mach)
3095{
3096   uint prevMask = mach->SwitchStack[mach->SwitchStackTop - 1].mask;
3097
3098   mach->Switch.mask |= ~mach->Switch.defaultMask & prevMask;
3099
3100   UPDATE_EXEC_MASK(mach);
3101}
3102
3103static void
3104exec_endswitch(struct tgsi_exec_machine *mach)
3105{
3106   mach->Switch = mach->SwitchStack[--mach->SwitchStackTop];
3107   mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
3108
3109   UPDATE_EXEC_MASK(mach);
3110}
3111
3112static void
3113micro_i2f(union tgsi_exec_channel *dst,
3114          const union tgsi_exec_channel *src)
3115{
3116   dst->f[0] = (float)src->i[0];
3117   dst->f[1] = (float)src->i[1];
3118   dst->f[2] = (float)src->i[2];
3119   dst->f[3] = (float)src->i[3];
3120}
3121
3122static void
3123micro_not(union tgsi_exec_channel *dst,
3124          const union tgsi_exec_channel *src)
3125{
3126   dst->u[0] = ~src->u[0];
3127   dst->u[1] = ~src->u[1];
3128   dst->u[2] = ~src->u[2];
3129   dst->u[3] = ~src->u[3];
3130}
3131
3132static void
3133micro_shl(union tgsi_exec_channel *dst,
3134          const union tgsi_exec_channel *src0,
3135          const union tgsi_exec_channel *src1)
3136{
3137   dst->u[0] = src0->u[0] << src1->u[0];
3138   dst->u[1] = src0->u[1] << src1->u[1];
3139   dst->u[2] = src0->u[2] << src1->u[2];
3140   dst->u[3] = src0->u[3] << src1->u[3];
3141}
3142
3143static void
3144micro_and(union tgsi_exec_channel *dst,
3145          const union tgsi_exec_channel *src0,
3146          const union tgsi_exec_channel *src1)
3147{
3148   dst->u[0] = src0->u[0] & src1->u[0];
3149   dst->u[1] = src0->u[1] & src1->u[1];
3150   dst->u[2] = src0->u[2] & src1->u[2];
3151   dst->u[3] = src0->u[3] & src1->u[3];
3152}
3153
3154static void
3155micro_or(union tgsi_exec_channel *dst,
3156         const union tgsi_exec_channel *src0,
3157         const union tgsi_exec_channel *src1)
3158{
3159   dst->u[0] = src0->u[0] | src1->u[0];
3160   dst->u[1] = src0->u[1] | src1->u[1];
3161   dst->u[2] = src0->u[2] | src1->u[2];
3162   dst->u[3] = src0->u[3] | src1->u[3];
3163}
3164
3165static void
3166micro_xor(union tgsi_exec_channel *dst,
3167          const union tgsi_exec_channel *src0,
3168          const union tgsi_exec_channel *src1)
3169{
3170   dst->u[0] = src0->u[0] ^ src1->u[0];
3171   dst->u[1] = src0->u[1] ^ src1->u[1];
3172   dst->u[2] = src0->u[2] ^ src1->u[2];
3173   dst->u[3] = src0->u[3] ^ src1->u[3];
3174}
3175
3176static void
3177micro_mod(union tgsi_exec_channel *dst,
3178          const union tgsi_exec_channel *src0,
3179          const union tgsi_exec_channel *src1)
3180{
3181   dst->i[0] = src0->i[0] % src1->i[0];
3182   dst->i[1] = src0->i[1] % src1->i[1];
3183   dst->i[2] = src0->i[2] % src1->i[2];
3184   dst->i[3] = src0->i[3] % src1->i[3];
3185}
3186
3187static void
3188micro_f2i(union tgsi_exec_channel *dst,
3189          const union tgsi_exec_channel *src)
3190{
3191   dst->i[0] = (int)src->f[0];
3192   dst->i[1] = (int)src->f[1];
3193   dst->i[2] = (int)src->f[2];
3194   dst->i[3] = (int)src->f[3];
3195}
3196
3197static void
3198micro_idiv(union tgsi_exec_channel *dst,
3199           const union tgsi_exec_channel *src0,
3200           const union tgsi_exec_channel *src1)
3201{
3202   dst->i[0] = src0->i[0] / src1->i[0];
3203   dst->i[1] = src0->i[1] / src1->i[1];
3204   dst->i[2] = src0->i[2] / src1->i[2];
3205   dst->i[3] = src0->i[3] / src1->i[3];
3206}
3207
3208static void
3209micro_imax(union tgsi_exec_channel *dst,
3210           const union tgsi_exec_channel *src0,
3211           const union tgsi_exec_channel *src1)
3212{
3213   dst->i[0] = src0->i[0] > src1->i[0] ? src0->i[0] : src1->i[0];
3214   dst->i[1] = src0->i[1] > src1->i[1] ? src0->i[1] : src1->i[1];
3215   dst->i[2] = src0->i[2] > src1->i[2] ? src0->i[2] : src1->i[2];
3216   dst->i[3] = src0->i[3] > src1->i[3] ? src0->i[3] : src1->i[3];
3217}
3218
3219static void
3220micro_imin(union tgsi_exec_channel *dst,
3221           const union tgsi_exec_channel *src0,
3222           const union tgsi_exec_channel *src1)
3223{
3224   dst->i[0] = src0->i[0] < src1->i[0] ? src0->i[0] : src1->i[0];
3225   dst->i[1] = src0->i[1] < src1->i[1] ? src0->i[1] : src1->i[1];
3226   dst->i[2] = src0->i[2] < src1->i[2] ? src0->i[2] : src1->i[2];
3227   dst->i[3] = src0->i[3] < src1->i[3] ? src0->i[3] : src1->i[3];
3228}
3229
3230static void
3231micro_isge(union tgsi_exec_channel *dst,
3232           const union tgsi_exec_channel *src0,
3233           const union tgsi_exec_channel *src1)
3234{
3235   dst->i[0] = src0->i[0] >= src1->i[0] ? -1 : 0;
3236   dst->i[1] = src0->i[1] >= src1->i[1] ? -1 : 0;
3237   dst->i[2] = src0->i[2] >= src1->i[2] ? -1 : 0;
3238   dst->i[3] = src0->i[3] >= src1->i[3] ? -1 : 0;
3239}
3240
3241static void
3242micro_ishr(union tgsi_exec_channel *dst,
3243           const union tgsi_exec_channel *src0,
3244           const union tgsi_exec_channel *src1)
3245{
3246   dst->i[0] = src0->i[0] >> src1->i[0];
3247   dst->i[1] = src0->i[1] >> src1->i[1];
3248   dst->i[2] = src0->i[2] >> src1->i[2];
3249   dst->i[3] = src0->i[3] >> src1->i[3];
3250}
3251
3252static void
3253micro_islt(union tgsi_exec_channel *dst,
3254           const union tgsi_exec_channel *src0,
3255           const union tgsi_exec_channel *src1)
3256{
3257   dst->i[0] = src0->i[0] < src1->i[0] ? -1 : 0;
3258   dst->i[1] = src0->i[1] < src1->i[1] ? -1 : 0;
3259   dst->i[2] = src0->i[2] < src1->i[2] ? -1 : 0;
3260   dst->i[3] = src0->i[3] < src1->i[3] ? -1 : 0;
3261}
3262
3263static void
3264micro_f2u(union tgsi_exec_channel *dst,
3265          const union tgsi_exec_channel *src)
3266{
3267   dst->u[0] = (uint)src->f[0];
3268   dst->u[1] = (uint)src->f[1];
3269   dst->u[2] = (uint)src->f[2];
3270   dst->u[3] = (uint)src->f[3];
3271}
3272
3273static void
3274micro_u2f(union tgsi_exec_channel *dst,
3275          const union tgsi_exec_channel *src)
3276{
3277   dst->f[0] = (float)src->u[0];
3278   dst->f[1] = (float)src->u[1];
3279   dst->f[2] = (float)src->u[2];
3280   dst->f[3] = (float)src->u[3];
3281}
3282
3283static void
3284micro_uadd(union tgsi_exec_channel *dst,
3285           const union tgsi_exec_channel *src0,
3286           const union tgsi_exec_channel *src1)
3287{
3288   dst->u[0] = src0->u[0] + src1->u[0];
3289   dst->u[1] = src0->u[1] + src1->u[1];
3290   dst->u[2] = src0->u[2] + src1->u[2];
3291   dst->u[3] = src0->u[3] + src1->u[3];
3292}
3293
3294static void
3295micro_udiv(union tgsi_exec_channel *dst,
3296           const union tgsi_exec_channel *src0,
3297           const union tgsi_exec_channel *src1)
3298{
3299   dst->u[0] = src0->u[0] / src1->u[0];
3300   dst->u[1] = src0->u[1] / src1->u[1];
3301   dst->u[2] = src0->u[2] / src1->u[2];
3302   dst->u[3] = src0->u[3] / src1->u[3];
3303}
3304
3305static void
3306micro_umad(union tgsi_exec_channel *dst,
3307           const union tgsi_exec_channel *src0,
3308           const union tgsi_exec_channel *src1,
3309           const union tgsi_exec_channel *src2)
3310{
3311   dst->u[0] = src0->u[0] * src1->u[0] + src2->u[0];
3312   dst->u[1] = src0->u[1] * src1->u[1] + src2->u[1];
3313   dst->u[2] = src0->u[2] * src1->u[2] + src2->u[2];
3314   dst->u[3] = src0->u[3] * src1->u[3] + src2->u[3];
3315}
3316
3317static void
3318micro_umax(union tgsi_exec_channel *dst,
3319           const union tgsi_exec_channel *src0,
3320           const union tgsi_exec_channel *src1)
3321{
3322   dst->u[0] = src0->u[0] > src1->u[0] ? src0->u[0] : src1->u[0];
3323   dst->u[1] = src0->u[1] > src1->u[1] ? src0->u[1] : src1->u[1];
3324   dst->u[2] = src0->u[2] > src1->u[2] ? src0->u[2] : src1->u[2];
3325   dst->u[3] = src0->u[3] > src1->u[3] ? src0->u[3] : src1->u[3];
3326}
3327
3328static void
3329micro_umin(union tgsi_exec_channel *dst,
3330           const union tgsi_exec_channel *src0,
3331           const union tgsi_exec_channel *src1)
3332{
3333   dst->u[0] = src0->u[0] < src1->u[0] ? src0->u[0] : src1->u[0];
3334   dst->u[1] = src0->u[1] < src1->u[1] ? src0->u[1] : src1->u[1];
3335   dst->u[2] = src0->u[2] < src1->u[2] ? src0->u[2] : src1->u[2];
3336   dst->u[3] = src0->u[3] < src1->u[3] ? src0->u[3] : src1->u[3];
3337}
3338
3339static void
3340micro_umod(union tgsi_exec_channel *dst,
3341           const union tgsi_exec_channel *src0,
3342           const union tgsi_exec_channel *src1)
3343{
3344   dst->u[0] = src0->u[0] % src1->u[0];
3345   dst->u[1] = src0->u[1] % src1->u[1];
3346   dst->u[2] = src0->u[2] % src1->u[2];
3347   dst->u[3] = src0->u[3] % src1->u[3];
3348}
3349
3350static void
3351micro_umul(union tgsi_exec_channel *dst,
3352           const union tgsi_exec_channel *src0,
3353           const union tgsi_exec_channel *src1)
3354{
3355   dst->u[0] = src0->u[0] * src1->u[0];
3356   dst->u[1] = src0->u[1] * src1->u[1];
3357   dst->u[2] = src0->u[2] * src1->u[2];
3358   dst->u[3] = src0->u[3] * src1->u[3];
3359}
3360
3361static void
3362micro_useq(union tgsi_exec_channel *dst,
3363           const union tgsi_exec_channel *src0,
3364           const union tgsi_exec_channel *src1)
3365{
3366   dst->u[0] = src0->u[0] == src1->u[0] ? ~0 : 0;
3367   dst->u[1] = src0->u[1] == src1->u[1] ? ~0 : 0;
3368   dst->u[2] = src0->u[2] == src1->u[2] ? ~0 : 0;
3369   dst->u[3] = src0->u[3] == src1->u[3] ? ~0 : 0;
3370}
3371
3372static void
3373micro_usge(union tgsi_exec_channel *dst,
3374           const union tgsi_exec_channel *src0,
3375           const union tgsi_exec_channel *src1)
3376{
3377   dst->u[0] = src0->u[0] >= src1->u[0] ? ~0 : 0;
3378   dst->u[1] = src0->u[1] >= src1->u[1] ? ~0 : 0;
3379   dst->u[2] = src0->u[2] >= src1->u[2] ? ~0 : 0;
3380   dst->u[3] = src0->u[3] >= src1->u[3] ? ~0 : 0;
3381}
3382
3383static void
3384micro_ushr(union tgsi_exec_channel *dst,
3385           const union tgsi_exec_channel *src0,
3386           const union tgsi_exec_channel *src1)
3387{
3388   dst->u[0] = src0->u[0] >> src1->u[0];
3389   dst->u[1] = src0->u[1] >> src1->u[1];
3390   dst->u[2] = src0->u[2] >> src1->u[2];
3391   dst->u[3] = src0->u[3] >> src1->u[3];
3392}
3393
3394static void
3395micro_uslt(union tgsi_exec_channel *dst,
3396           const union tgsi_exec_channel *src0,
3397           const union tgsi_exec_channel *src1)
3398{
3399   dst->u[0] = src0->u[0] < src1->u[0] ? ~0 : 0;
3400   dst->u[1] = src0->u[1] < src1->u[1] ? ~0 : 0;
3401   dst->u[2] = src0->u[2] < src1->u[2] ? ~0 : 0;
3402   dst->u[3] = src0->u[3] < src1->u[3] ? ~0 : 0;
3403}
3404
3405static void
3406micro_usne(union tgsi_exec_channel *dst,
3407           const union tgsi_exec_channel *src0,
3408           const union tgsi_exec_channel *src1)
3409{
3410   dst->u[0] = src0->u[0] != src1->u[0] ? ~0 : 0;
3411   dst->u[1] = src0->u[1] != src1->u[1] ? ~0 : 0;
3412   dst->u[2] = src0->u[2] != src1->u[2] ? ~0 : 0;
3413   dst->u[3] = src0->u[3] != src1->u[3] ? ~0 : 0;
3414}
3415
3416static void
3417micro_uarl(union tgsi_exec_channel *dst,
3418           const union tgsi_exec_channel *src)
3419{
3420   dst->i[0] = src->u[0];
3421   dst->i[1] = src->u[1];
3422   dst->i[2] = src->u[2];
3423   dst->i[3] = src->u[3];
3424}
3425
3426static void
3427micro_ucmp(union tgsi_exec_channel *dst,
3428           const union tgsi_exec_channel *src0,
3429           const union tgsi_exec_channel *src1,
3430           const union tgsi_exec_channel *src2)
3431{
3432   dst->u[0] = src0->u[0] ? src1->u[0] : src2->u[0];
3433   dst->u[1] = src0->u[1] ? src1->u[1] : src2->u[1];
3434   dst->u[2] = src0->u[2] ? src1->u[2] : src2->u[2];
3435   dst->u[3] = src0->u[3] ? src1->u[3] : src2->u[3];
3436}
3437
3438static void
3439exec_instruction(
3440   struct tgsi_exec_machine *mach,
3441   const struct tgsi_full_instruction *inst,
3442   int *pc )
3443{
3444   union tgsi_exec_channel r[10];
3445
3446   (*pc)++;
3447
3448   switch (inst->Instruction.Opcode) {
3449   case TGSI_OPCODE_ARL:
3450      exec_vector_unary(mach, inst, micro_arl, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
3451      break;
3452
3453   case TGSI_OPCODE_MOV:
3454      exec_vector_unary(mach, inst, micro_mov, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
3455      break;
3456
3457   case TGSI_OPCODE_LIT:
3458      exec_lit(mach, inst);
3459      break;
3460
3461   case TGSI_OPCODE_RCP:
3462      exec_scalar_unary(mach, inst, micro_rcp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3463      break;
3464
3465   case TGSI_OPCODE_RSQ:
3466      exec_scalar_unary(mach, inst, micro_rsq, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3467      break;
3468
3469   case TGSI_OPCODE_EXP:
3470      exec_exp(mach, inst);
3471      break;
3472
3473   case TGSI_OPCODE_LOG:
3474      exec_log(mach, inst);
3475      break;
3476
3477   case TGSI_OPCODE_MUL:
3478      exec_vector_binary(mach, inst, micro_mul, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3479      break;
3480
3481   case TGSI_OPCODE_ADD:
3482      exec_vector_binary(mach, inst, micro_add, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3483      break;
3484
3485   case TGSI_OPCODE_DP3:
3486      exec_dp3(mach, inst);
3487      break;
3488
3489   case TGSI_OPCODE_DP4:
3490      exec_dp4(mach, inst);
3491      break;
3492
3493   case TGSI_OPCODE_DST:
3494      exec_dst(mach, inst);
3495      break;
3496
3497   case TGSI_OPCODE_MIN:
3498      exec_vector_binary(mach, inst, micro_min, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3499      break;
3500
3501   case TGSI_OPCODE_MAX:
3502      exec_vector_binary(mach, inst, micro_max, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3503      break;
3504
3505   case TGSI_OPCODE_SLT:
3506      exec_vector_binary(mach, inst, micro_slt, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3507      break;
3508
3509   case TGSI_OPCODE_SGE:
3510      exec_vector_binary(mach, inst, micro_sge, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3511      break;
3512
3513   case TGSI_OPCODE_MAD:
3514      exec_vector_trinary(mach, inst, micro_mad, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3515      break;
3516
3517   case TGSI_OPCODE_SUB:
3518      exec_vector_binary(mach, inst, micro_sub, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3519      break;
3520
3521   case TGSI_OPCODE_LRP:
3522      exec_vector_trinary(mach, inst, micro_lrp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3523      break;
3524
3525   case TGSI_OPCODE_CND:
3526      exec_vector_trinary(mach, inst, micro_cnd, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3527      break;
3528
3529   case TGSI_OPCODE_DP2A:
3530      exec_dp2a(mach, inst);
3531      break;
3532
3533   case TGSI_OPCODE_FRC:
3534      exec_vector_unary(mach, inst, micro_frc, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3535      break;
3536
3537   case TGSI_OPCODE_CLAMP:
3538      exec_vector_trinary(mach, inst, micro_clamp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3539      break;
3540
3541   case TGSI_OPCODE_FLR:
3542      exec_vector_unary(mach, inst, micro_flr, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3543      break;
3544
3545   case TGSI_OPCODE_ROUND:
3546      exec_vector_unary(mach, inst, micro_rnd, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3547      break;
3548
3549   case TGSI_OPCODE_EX2:
3550      exec_scalar_unary(mach, inst, micro_exp2, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3551      break;
3552
3553   case TGSI_OPCODE_LG2:
3554      exec_scalar_unary(mach, inst, micro_lg2, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3555      break;
3556
3557   case TGSI_OPCODE_POW:
3558      exec_scalar_binary(mach, inst, micro_pow, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3559      break;
3560
3561   case TGSI_OPCODE_XPD:
3562      exec_xpd(mach, inst);
3563      break;
3564
3565   case TGSI_OPCODE_ABS:
3566      exec_vector_unary(mach, inst, micro_abs, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3567      break;
3568
3569   case TGSI_OPCODE_RCC:
3570      exec_scalar_unary(mach, inst, micro_rcc, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3571      break;
3572
3573   case TGSI_OPCODE_DPH:
3574      exec_dph(mach, inst);
3575      break;
3576
3577   case TGSI_OPCODE_COS:
3578      exec_scalar_unary(mach, inst, micro_cos, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3579      break;
3580
3581   case TGSI_OPCODE_DDX:
3582      exec_vector_unary(mach, inst, micro_ddx, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3583      break;
3584
3585   case TGSI_OPCODE_DDY:
3586      exec_vector_unary(mach, inst, micro_ddy, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3587      break;
3588
3589   case TGSI_OPCODE_KILP:
3590      exec_kilp (mach, inst);
3591      break;
3592
3593   case TGSI_OPCODE_KIL:
3594      exec_kil (mach, inst);
3595      break;
3596
3597   case TGSI_OPCODE_PK2H:
3598      assert (0);
3599      break;
3600
3601   case TGSI_OPCODE_PK2US:
3602      assert (0);
3603      break;
3604
3605   case TGSI_OPCODE_PK4B:
3606      assert (0);
3607      break;
3608
3609   case TGSI_OPCODE_PK4UB:
3610      assert (0);
3611      break;
3612
3613   case TGSI_OPCODE_RFL:
3614      exec_rfl(mach, inst);
3615      break;
3616
3617   case TGSI_OPCODE_SEQ:
3618      exec_vector_binary(mach, inst, micro_seq, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3619      break;
3620
3621   case TGSI_OPCODE_SFL:
3622      exec_vector(mach, inst, micro_sfl, TGSI_EXEC_DATA_FLOAT);
3623      break;
3624
3625   case TGSI_OPCODE_SGT:
3626      exec_vector_binary(mach, inst, micro_sgt, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3627      break;
3628
3629   case TGSI_OPCODE_SIN:
3630      exec_scalar_unary(mach, inst, micro_sin, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3631      break;
3632
3633   case TGSI_OPCODE_SLE:
3634      exec_vector_binary(mach, inst, micro_sle, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3635      break;
3636
3637   case TGSI_OPCODE_SNE:
3638      exec_vector_binary(mach, inst, micro_sne, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3639      break;
3640
3641   case TGSI_OPCODE_STR:
3642      exec_vector(mach, inst, micro_str, TGSI_EXEC_DATA_FLOAT);
3643      break;
3644
3645   case TGSI_OPCODE_TEX:
3646      /* simple texture lookup */
3647      /* src[0] = texcoord */
3648      /* src[1] = sampler unit */
3649      exec_tex(mach, inst, TEX_MODIFIER_NONE);
3650      break;
3651
3652   case TGSI_OPCODE_TXB:
3653      /* Texture lookup with lod bias */
3654      /* src[0] = texcoord (src[0].w = LOD bias) */
3655      /* src[1] = sampler unit */
3656      exec_tex(mach, inst, TEX_MODIFIER_LOD_BIAS);
3657      break;
3658
3659   case TGSI_OPCODE_TXD:
      /* Texture lookup with explicit partial derivatives */
3661      /* src[0] = texcoord */
3662      /* src[1] = d[strq]/dx */
3663      /* src[2] = d[strq]/dy */
3664      /* src[3] = sampler unit */
3665      exec_txd(mach, inst);
3666      break;
3667
3668   case TGSI_OPCODE_TXL:
      /* Texture lookup with explicit LOD */
3670      /* src[0] = texcoord (src[0].w = LOD) */
3671      /* src[1] = sampler unit */
3672      exec_tex(mach, inst, TEX_MODIFIER_EXPLICIT_LOD);
3673      break;
3674
3675   case TGSI_OPCODE_TXP:
3676      /* Texture lookup with projection */
3677      /* src[0] = texcoord (src[0].w = projection) */
3678      /* src[1] = sampler unit */
3679      exec_tex(mach, inst, TEX_MODIFIER_PROJECTED);
3680      break;
3681
3682   case TGSI_OPCODE_UP2H:
3683      assert (0);
3684      break;
3685
3686   case TGSI_OPCODE_UP2US:
3687      assert (0);
3688      break;
3689
3690   case TGSI_OPCODE_UP4B:
3691      assert (0);
3692      break;
3693
3694   case TGSI_OPCODE_UP4UB:
3695      assert (0);
3696      break;
3697
3698   case TGSI_OPCODE_X2D:
3699      exec_x2d(mach, inst);
3700      break;
3701
3702   case TGSI_OPCODE_ARA:
3703      assert (0);
3704      break;
3705
3706   case TGSI_OPCODE_ARR:
3707      exec_vector_unary(mach, inst, micro_arr, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
3708      break;
3709
3710   case TGSI_OPCODE_BRA:
3711      assert (0);
3712      break;
3713
3714   case TGSI_OPCODE_CAL:
3715      /* skip the call if no execution channels are enabled */
3716      if (mach->ExecMask) {
3717         /* do the call */
3718
3719         /* First, record the depths of the execution stacks.
3720          * This is important for deeply nested/looped return statements.
3721          * We have to unwind the stacks by the correct amount.  For a
3722          * real code generator, we could determine the number of entries
3723          * to pop off each stack with simple static analysis and avoid
3724          * implementing this data structure at run time.
3725          */
3726         mach->CallStack[mach->CallStackTop].CondStackTop = mach->CondStackTop;
3727         mach->CallStack[mach->CallStackTop].LoopStackTop = mach->LoopStackTop;
3728         mach->CallStack[mach->CallStackTop].ContStackTop = mach->ContStackTop;
3729         mach->CallStack[mach->CallStackTop].SwitchStackTop = mach->SwitchStackTop;
3730         mach->CallStack[mach->CallStackTop].BreakStackTop = mach->BreakStackTop;
3731         /* note that PC was already incremented above */
3732         mach->CallStack[mach->CallStackTop].ReturnAddr = *pc;
3733
3734         mach->CallStackTop++;
3735
3736         /* Second, push the Cond, Loop, Cont, Func stacks */
3737         assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
3738         assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3739         assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3740         assert(mach->SwitchStackTop < TGSI_EXEC_MAX_SWITCH_NESTING);
3741         assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
3742         assert(mach->FuncStackTop < TGSI_EXEC_MAX_CALL_NESTING);
3743
3744         mach->CondStack[mach->CondStackTop++] = mach->CondMask;
3745         mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
3746         mach->ContStack[mach->ContStackTop++] = mach->ContMask;
3747         mach->SwitchStack[mach->SwitchStackTop++] = mach->Switch;
3748         mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
3749         mach->FuncStack[mach->FuncStackTop++] = mach->FuncMask;
3750
3751         /* Finally, jump to the subroutine */
3752         *pc = inst->Label.Label;
3753      }
3754      break;
3755
3756   case TGSI_OPCODE_RET:
3757      mach->FuncMask &= ~mach->ExecMask;
3758      UPDATE_EXEC_MASK(mach);
3759
3760      if (mach->FuncMask == 0x0) {
3761         /* really return now (otherwise, keep executing */
3762
3763         if (mach->CallStackTop == 0) {
3764            /* returning from main() */
3765            mach->CondStackTop = 0;
3766            mach->LoopStackTop = 0;
3767            *pc = -1;
3768            return;
3769         }
3770
3771         assert(mach->CallStackTop > 0);
3772         mach->CallStackTop--;
3773
3774         mach->CondStackTop = mach->CallStack[mach->CallStackTop].CondStackTop;
3775         mach->CondMask = mach->CondStack[mach->CondStackTop];
3776
3777         mach->LoopStackTop = mach->CallStack[mach->CallStackTop].LoopStackTop;
3778         mach->LoopMask = mach->LoopStack[mach->LoopStackTop];
3779
3780         mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
3781         mach->ContMask = mach->ContStack[mach->ContStackTop];
3782
3783         mach->SwitchStackTop = mach->CallStack[mach->CallStackTop].SwitchStackTop;
3784         mach->Switch = mach->SwitchStack[mach->SwitchStackTop];
3785
3786         mach->BreakStackTop = mach->CallStack[mach->CallStackTop].BreakStackTop;
3787         mach->BreakType = mach->BreakStack[mach->BreakStackTop];
3788
3789         assert(mach->FuncStackTop > 0);
3790         mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
3791
3792         *pc = mach->CallStack[mach->CallStackTop].ReturnAddr;
3793
3794         UPDATE_EXEC_MASK(mach);
3795      }
3796      break;
3797
3798   case TGSI_OPCODE_SSG:
3799      exec_vector_unary(mach, inst, micro_sgn, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3800      break;
3801
3802   case TGSI_OPCODE_CMP:
3803      exec_vector_trinary(mach, inst, micro_cmp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3804      break;
3805
3806   case TGSI_OPCODE_SCS:
3807      exec_scs(mach, inst);
3808      break;
3809
3810   case TGSI_OPCODE_NRM:
3811      exec_nrm3(mach, inst);
3812      break;
3813
3814   case TGSI_OPCODE_NRM4:
3815      exec_nrm4(mach, inst);
3816      break;
3817
3818   case TGSI_OPCODE_DIV:
3819      exec_vector_binary(mach, inst, micro_div, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3820      break;
3821
3822   case TGSI_OPCODE_DP2:
3823      exec_dp2(mach, inst);
3824      break;
3825
3826   case TGSI_OPCODE_IF:
3827      /* push CondMask */
3828      assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
3829      mach->CondStack[mach->CondStackTop++] = mach->CondMask;
3830      FETCH( &r[0], 0, CHAN_X );
3831      /* update CondMask */
3832      if( ! r[0].u[0] ) {
3833         mach->CondMask &= ~0x1;
3834      }
3835      if( ! r[0].u[1] ) {
3836         mach->CondMask &= ~0x2;
3837      }
3838      if( ! r[0].u[2] ) {
3839         mach->CondMask &= ~0x4;
3840      }
3841      if( ! r[0].u[3] ) {
3842         mach->CondMask &= ~0x8;
3843      }
3844      UPDATE_EXEC_MASK(mach);
3845      /* Todo: If CondMask==0, jump to ELSE */
3846      break;
3847
3848   case TGSI_OPCODE_ELSE:
3849      /* invert CondMask wrt previous mask */
3850      {
3851         uint prevMask;
3852         assert(mach->CondStackTop > 0);
3853         prevMask = mach->CondStack[mach->CondStackTop - 1];
3854         mach->CondMask = ~mach->CondMask & prevMask;
3855         UPDATE_EXEC_MASK(mach);
3856         /* Todo: If CondMask==0, jump to ENDIF */
3857      }
3858      break;
3859
3860   case TGSI_OPCODE_ENDIF:
3861      /* pop CondMask */
3862      assert(mach->CondStackTop > 0);
3863      mach->CondMask = mach->CondStack[--mach->CondStackTop];
3864      UPDATE_EXEC_MASK(mach);
3865      break;
3866
3867   case TGSI_OPCODE_END:
3868      /* make sure we end primitives which haven't
3869       * been explicitly emitted */
3870      conditional_emit_primitive(mach);
3871      /* halt execution */
3872      *pc = -1;
3873      break;
3874
3875   case TGSI_OPCODE_PUSHA:
3876      assert (0);
3877      break;
3878
3879   case TGSI_OPCODE_POPA:
3880      assert (0);
3881      break;
3882
3883   case TGSI_OPCODE_CEIL:
3884      exec_vector_unary(mach, inst, micro_ceil, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3885      break;
3886
3887   case TGSI_OPCODE_I2F:
3888      exec_vector_unary(mach, inst, micro_i2f, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_INT);
3889      break;
3890
3891   case TGSI_OPCODE_NOT:
3892      exec_vector_unary(mach, inst, micro_not, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3893      break;
3894
3895   case TGSI_OPCODE_TRUNC:
3896      exec_vector_unary(mach, inst, micro_trunc, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3897      break;
3898
3899   case TGSI_OPCODE_SHL:
3900      exec_vector_binary(mach, inst, micro_shl, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3901      break;
3902
3903   case TGSI_OPCODE_AND:
3904      exec_vector_binary(mach, inst, micro_and, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3905      break;
3906
3907   case TGSI_OPCODE_OR:
3908      exec_vector_binary(mach, inst, micro_or, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3909      break;
3910
3911   case TGSI_OPCODE_MOD:
3912      exec_vector_binary(mach, inst, micro_mod, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3913      break;
3914
3915   case TGSI_OPCODE_XOR:
3916      exec_vector_binary(mach, inst, micro_xor, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3917      break;
3918
3919   case TGSI_OPCODE_SAD:
3920      assert (0);
3921      break;
3922
3923   case TGSI_OPCODE_TXF:
3924      exec_txf(mach, inst);
3925      break;
3926
3927   case TGSI_OPCODE_TXQ:
3928      exec_txq(mach, inst);
3929      break;
3930
3931   case TGSI_OPCODE_EMIT:
3932      emit_vertex(mach);
3933      break;
3934
3935   case TGSI_OPCODE_ENDPRIM:
3936      emit_primitive(mach);
3937      break;
3938
3939   case TGSI_OPCODE_BGNLOOP:
3940      /* push LoopMask and ContMasks */
3941      assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3942      assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3943      assert(mach->LoopLabelStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3944      assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
3945
3946      mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
3947      mach->ContStack[mach->ContStackTop++] = mach->ContMask;
3948      mach->LoopLabelStack[mach->LoopLabelStackTop++] = *pc - 1;
3949      mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
3950      mach->BreakType = TGSI_EXEC_BREAK_INSIDE_LOOP;
3951      break;
3952
3953   case TGSI_OPCODE_ENDLOOP:
3954      /* Restore ContMask, but don't pop */
3955      assert(mach->ContStackTop > 0);
3956      mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
3957      UPDATE_EXEC_MASK(mach);
3958      if (mach->ExecMask) {
3959         /* repeat loop: jump to instruction just past BGNLOOP */
3960         assert(mach->LoopLabelStackTop > 0);
3961         *pc = mach->LoopLabelStack[mach->LoopLabelStackTop - 1] + 1;
3962      }
3963      else {
3964         /* exit loop: pop LoopMask */
3965         assert(mach->LoopStackTop > 0);
3966         mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
3967         /* pop ContMask */
3968         assert(mach->ContStackTop > 0);
3969         mach->ContMask = mach->ContStack[--mach->ContStackTop];
3970         assert(mach->LoopLabelStackTop > 0);
3971         --mach->LoopLabelStackTop;
3972
3973         mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
3974      }
3975      UPDATE_EXEC_MASK(mach);
3976      break;
3977
3978   case TGSI_OPCODE_BRK:
3979      exec_break(mach);
3980      break;
3981
3982   case TGSI_OPCODE_CONT:
3983      /* turn off cont channels for each enabled exec channel */
3984      mach->ContMask &= ~mach->ExecMask;
3985      /* Todo: if mach->LoopMask == 0, jump to end of loop */
3986      UPDATE_EXEC_MASK(mach);
3987      break;
3988
3989   case TGSI_OPCODE_BGNSUB:
3990      /* no-op */
3991      break;
3992
3993   case TGSI_OPCODE_ENDSUB:
3994      /*
3995       * XXX: This really should be a no-op. We should never reach this opcode.
3996       */
3997
3998      assert(mach->CallStackTop > 0);
3999      mach->CallStackTop--;
4000
4001      mach->CondStackTop = mach->CallStack[mach->CallStackTop].CondStackTop;
4002      mach->CondMask = mach->CondStack[mach->CondStackTop];
4003
4004      mach->LoopStackTop = mach->CallStack[mach->CallStackTop].LoopStackTop;
4005      mach->LoopMask = mach->LoopStack[mach->LoopStackTop];
4006
4007      mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
4008      mach->ContMask = mach->ContStack[mach->ContStackTop];
4009
4010      mach->SwitchStackTop = mach->CallStack[mach->CallStackTop].SwitchStackTop;
4011      mach->Switch = mach->SwitchStack[mach->SwitchStackTop];
4012
4013      mach->BreakStackTop = mach->CallStack[mach->CallStackTop].BreakStackTop;
4014      mach->BreakType = mach->BreakStack[mach->BreakStackTop];
4015
4016      assert(mach->FuncStackTop > 0);
4017      mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
4018
4019      *pc = mach->CallStack[mach->CallStackTop].ReturnAddr;
4020
4021      UPDATE_EXEC_MASK(mach);
4022      break;
4023
4024   case TGSI_OPCODE_NOP:
4025      break;
4026
4027   case TGSI_OPCODE_BREAKC:
4028      FETCH(&r[0], 0, CHAN_X);
4029      /* update CondMask */
4030      if (r[0].u[0] && (mach->ExecMask & 0x1)) {
4031         mach->LoopMask &= ~0x1;
4032      }
4033      if (r[0].u[1] && (mach->ExecMask & 0x2)) {
4034         mach->LoopMask &= ~0x2;
4035      }
4036      if (r[0].u[2] && (mach->ExecMask & 0x4)) {
4037         mach->LoopMask &= ~0x4;
4038      }
4039      if (r[0].u[3] && (mach->ExecMask & 0x8)) {
4040         mach->LoopMask &= ~0x8;
4041      }
4042      /* Todo: if mach->LoopMask == 0, jump to end of loop */
4043      UPDATE_EXEC_MASK(mach);
4044      break;
4045
4046   case TGSI_OPCODE_F2I:
4047      exec_vector_unary(mach, inst, micro_f2i, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
4048      break;
4049
4050   case TGSI_OPCODE_IDIV:
4051      exec_vector_binary(mach, inst, micro_idiv, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
4052      break;
4053
4054   case TGSI_OPCODE_IMAX:
4055      exec_vector_binary(mach, inst, micro_imax, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
4056      break;
4057
4058   case TGSI_OPCODE_IMIN:
4059      exec_vector_binary(mach, inst, micro_imin, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
4060      break;
4061
4062   case TGSI_OPCODE_INEG:
4063      exec_vector_unary(mach, inst, micro_ineg, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
4064      break;
4065
4066   case TGSI_OPCODE_ISGE:
4067      exec_vector_binary(mach, inst, micro_isge, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
4068      break;
4069
4070   case TGSI_OPCODE_ISHR:
4071      exec_vector_binary(mach, inst, micro_ishr, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
4072      break;
4073
4074   case TGSI_OPCODE_ISLT:
4075      exec_vector_binary(mach, inst, micro_islt, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
4076      break;
4077
4078   case TGSI_OPCODE_F2U:
4079      exec_vector_unary(mach, inst, micro_f2u, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
4080      break;
4081
4082   case TGSI_OPCODE_U2F:
4083      exec_vector_unary(mach, inst, micro_u2f, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_UINT);
4084      break;
4085
4086   case TGSI_OPCODE_UADD:
4087      exec_vector_binary(mach, inst, micro_uadd, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4088      break;
4089
4090   case TGSI_OPCODE_UDIV:
4091      exec_vector_binary(mach, inst, micro_udiv, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4092      break;
4093
4094   case TGSI_OPCODE_UMAD:
4095      exec_vector_trinary(mach, inst, micro_umad, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4096      break;
4097
4098   case TGSI_OPCODE_UMAX:
4099      exec_vector_binary(mach, inst, micro_umax, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4100      break;
4101
4102   case TGSI_OPCODE_UMIN:
4103      exec_vector_binary(mach, inst, micro_umin, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4104      break;
4105
4106   case TGSI_OPCODE_UMOD:
4107      exec_vector_binary(mach, inst, micro_umod, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4108      break;
4109
4110   case TGSI_OPCODE_UMUL:
4111      exec_vector_binary(mach, inst, micro_umul, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4112      break;
4113
4114   case TGSI_OPCODE_USEQ:
4115      exec_vector_binary(mach, inst, micro_useq, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4116      break;
4117
4118   case TGSI_OPCODE_USGE:
4119      exec_vector_binary(mach, inst, micro_usge, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4120      break;
4121
4122   case TGSI_OPCODE_USHR:
4123      exec_vector_binary(mach, inst, micro_ushr, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4124      break;
4125
4126   case TGSI_OPCODE_USLT:
4127      exec_vector_binary(mach, inst, micro_uslt, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4128      break;
4129
4130   case TGSI_OPCODE_USNE:
4131      exec_vector_binary(mach, inst, micro_usne, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4132      break;
4133
4134   case TGSI_OPCODE_SWITCH:
4135      exec_switch(mach, inst);
4136      break;
4137
4138   case TGSI_OPCODE_CASE:
4139      exec_case(mach, inst);
4140      break;
4141
4142   case TGSI_OPCODE_DEFAULT:
4143      exec_default(mach);
4144      break;
4145
4146   case TGSI_OPCODE_ENDSWITCH:
4147      exec_endswitch(mach);
4148      break;
4149
4150   case TGSI_OPCODE_LOAD:
4151      assert(0);
4152      break;
4153
4154   case TGSI_OPCODE_LOAD_MS:
4155      assert(0);
4156      break;
4157
4158   case TGSI_OPCODE_SAMPLE:
4159      exec_sample(mach, inst, TEX_MODIFIER_NONE);
4160      break;
4161
4162   case TGSI_OPCODE_SAMPLE_B:
4163      exec_sample(mach, inst, TEX_MODIFIER_LOD_BIAS);
4164      break;
4165
4166   case TGSI_OPCODE_SAMPLE_C:
4167      exec_sample(mach, inst, TEX_MODIFIER_NONE);
4168      break;
4169
4170   case TGSI_OPCODE_SAMPLE_C_LZ:
4171      exec_sample(mach, inst, TEX_MODIFIER_LOD_BIAS);
4172      break;
4173
4174   case TGSI_OPCODE_SAMPLE_D:
4175      exec_sample_d(mach, inst);
4176      break;
4177
4178   case TGSI_OPCODE_SAMPLE_L:
4179      exec_sample(mach, inst, TEX_MODIFIER_EXPLICIT_LOD);
4180      break;
4181
4182   case TGSI_OPCODE_GATHER4:
4183      assert(0);
4184      break;
4185
4186   case TGSI_OPCODE_RESINFO:
4187      assert(0);
4188      break;
4189
4190   case TGSI_OPCODE_SAMPLE_POS:
4191      assert(0);
4192      break;
4193
4194   case TGSI_OPCODE_SAMPLE_INFO:
4195      assert(0);
4196      break;
4197
4198   case TGSI_OPCODE_UARL:
4199      exec_vector_unary(mach, inst, micro_uarl, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_UINT);
4200      break;
4201
4202   case TGSI_OPCODE_UCMP:
4203      exec_vector_trinary(mach, inst, micro_ucmp, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4204      break;
4205
4206   case TGSI_OPCODE_IABS:
4207      exec_vector_unary(mach, inst, micro_iabs, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
4208      break;
4209
4210   default:
4211      assert( 0 );
4212   }
4213}
4214
4215
4216#define DEBUG_EXECUTION 0
4217
4218
4219/**
4220 * Run TGSI interpreter.
4221 * \return bitmask of "alive" quad components
4222 */
4223uint
4224tgsi_exec_machine_run( struct tgsi_exec_machine *mach )
4225{
4226   uint i;
4227   int pc = 0;
4228
4229   mach->CondMask = 0xf;
4230   mach->LoopMask = 0xf;
4231   mach->ContMask = 0xf;
4232   mach->FuncMask = 0xf;
4233   mach->ExecMask = 0xf;
4234
4235   mach->Switch.mask = 0xf;
4236
4237   assert(mach->CondStackTop == 0);
4238   assert(mach->LoopStackTop == 0);
4239   assert(mach->ContStackTop == 0);
4240   assert(mach->SwitchStackTop == 0);
4241   assert(mach->BreakStackTop == 0);
4242   assert(mach->CallStackTop == 0);
4243
4244   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] = 0;
4245   mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] = 0;
4246
4247   if( mach->Processor == TGSI_PROCESSOR_GEOMETRY ) {
4248      mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0] = 0;
4249      mach->Primitives[0] = 0;
4250   }
4251
4252   /* execute declarations (interpolants) */
4253   for (i = 0; i < mach->NumDeclarations; i++) {
4254      exec_declaration( mach, mach->Declarations+i );
4255   }
4256
4257   {
4258#if DEBUG_EXECUTION
4259      struct tgsi_exec_vector temps[TGSI_EXEC_NUM_TEMPS + TGSI_EXEC_NUM_TEMP_EXTRAS];
4260      struct tgsi_exec_vector outputs[PIPE_MAX_ATTRIBS];
4261      uint inst = 1;
4262
4263      memcpy(temps, mach->Temps, sizeof(temps));
4264      memcpy(outputs, mach->Outputs, sizeof(outputs));
4265#endif
4266
4267      /* execute instructions, until pc is set to -1 */
4268      while (pc != -1) {
4269
4270#if DEBUG_EXECUTION
4271         uint i;
4272
4273         tgsi_dump_instruction(&mach->Instructions[pc], inst++);
4274#endif
4275
4276         assert(pc < (int) mach->NumInstructions);
4277         exec_instruction(mach, mach->Instructions + pc, &pc);
4278
4279#if DEBUG_EXECUTION
4280         for (i = 0; i < TGSI_EXEC_NUM_TEMPS + TGSI_EXEC_NUM_TEMP_EXTRAS; i++) {
4281            if (memcmp(&temps[i], &mach->Temps[i], sizeof(temps[i]))) {
4282               uint j;
4283
4284               memcpy(&temps[i], &mach->Temps[i], sizeof(temps[i]));
4285               debug_printf("TEMP[%2u] = ", i);
4286               for (j = 0; j < 4; j++) {
4287                  if (j > 0) {
4288                     debug_printf("           ");
4289                  }
4290                  debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
4291                               temps[i].xyzw[0].f[j], temps[i].xyzw[0].u[j],
4292                               temps[i].xyzw[1].f[j], temps[i].xyzw[1].u[j],
4293                               temps[i].xyzw[2].f[j], temps[i].xyzw[2].u[j],
4294                               temps[i].xyzw[3].f[j], temps[i].xyzw[3].u[j]);
4295               }
4296            }
4297         }
4298         for (i = 0; i < PIPE_MAX_ATTRIBS; i++) {
4299            if (memcmp(&outputs[i], &mach->Outputs[i], sizeof(outputs[i]))) {
4300               uint j;
4301
4302               memcpy(&outputs[i], &mach->Outputs[i], sizeof(outputs[i]));
4303               debug_printf("OUT[%2u] =  ", i);
4304               for (j = 0; j < 4; j++) {
4305                  if (j > 0) {
4306                     debug_printf("           ");
4307                  }
4308                  debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
4309                               outputs[i].xyzw[0].f[j], outputs[i].xyzw[0].u[j],
4310                               outputs[i].xyzw[1].f[j], outputs[i].xyzw[1].u[j],
4311                               outputs[i].xyzw[2].f[j], outputs[i].xyzw[2].u[j],
4312                               outputs[i].xyzw[3].f[j], outputs[i].xyzw[3].u[j]);
4313               }
4314            }
4315         }
4316#endif
4317      }
4318   }
4319
4320#if 0
4321   /* we scale from floats in [0,1] to Zbuffer ints in sp_quad_depth_test.c */
4322   if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) {
4323      /*
4324       * Scale back depth component.
4325       */
4326      for (i = 0; i < 4; i++)
4327         mach->Outputs[0].xyzw[2].f[i] *= ctx->DrawBuffer->_DepthMaxF;
4328   }
4329#endif
4330
4331   /* Strictly speaking, these assertions aren't really needed but they
4332    * can potentially catch some bugs in the control flow code.
4333    */
4334   assert(mach->CondStackTop == 0);
4335   assert(mach->LoopStackTop == 0);
4336   assert(mach->ContStackTop == 0);
4337   assert(mach->SwitchStackTop == 0);
4338   assert(mach->BreakStackTop == 0);
4339   assert(mach->CallStackTop == 0);
4340
4341   return ~mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
4342}
4343