tgsi_exec.c revision 0ec30805a4aad945515957e980374f65fbd3b66e
1/**************************************************************************
2 *
3 * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
4 * All Rights Reserved.
5 * Copyright 2009-2010 VMware, Inc.  All rights Reserved.
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the
9 * "Software"), to deal in the Software without restriction, including
10 * without limitation the rights to use, copy, modify, merge, publish,
11 * distribute, sub license, and/or sell copies of the Software, and to
12 * permit persons to whom the Software is furnished to do so, subject to
13 * the following conditions:
14 *
15 * The above copyright notice and this permission notice (including the
16 * next paragraph) shall be included in all copies or substantial portions
17 * of the Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
22 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
23 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
24 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
25 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 *
27 **************************************************************************/
28
29/**
30 * TGSI interpreter/executor.
31 *
32 * Flow control information:
33 *
34 * Since we operate on 'quads' (4 pixels or 4 vertices in parallel)
35 * flow control statements (IF/ELSE/ENDIF, LOOP/ENDLOOP) require special
36 * care since a condition may be true for some quad components but false
37 * for other components.
38 *
39 * We basically execute all statements (even if they're in the part of
40 * an IF/ELSE clause that's "not taken") and use a special mask to
41 * control writing to destination registers.  This is the ExecMask.
42 * See store_dest().
43 *
 * The ExecMask is computed by ANDing together several other masks (CondMask,
 * LoopMask, ContMask, Switch.mask and FuncMask; see UPDATE_EXEC_MASK) which
 * are controlled by the flow control instructions (for example
 * IF/ELSE/ENDIF, LOOP/ENDLOOP and CONT).
47 *
48 *
49 * Authors:
50 *   Michal Krol
51 *   Brian Paul
52 */
53
54#include "pipe/p_compiler.h"
55#include "pipe/p_state.h"
56#include "pipe/p_shader_tokens.h"
57#include "tgsi/tgsi_dump.h"
58#include "tgsi/tgsi_parse.h"
59#include "tgsi/tgsi_util.h"
60#include "tgsi_exec.h"
61#include "util/u_memory.h"
62#include "util/u_math.h"
63
64
/*
 * When non-zero, micro_exp2(), micro_lg2() and micro_pow() call the
 * util_fast_*() helpers instead of powf()/logf().
 */
#define FAST_MATH 0

/*
 * Indices of the four elements of a channel, named after tile corners.
 * micro_ddx() and micro_ddy() use them to pick the elements to subtract.
 */
#define TILE_TOP_LEFT     0
#define TILE_TOP_RIGHT    1
#define TILE_BOTTOM_LEFT  2
#define TILE_BOTTOM_RIGHT 3
71
72static void
73micro_abs(union tgsi_exec_channel *dst,
74          const union tgsi_exec_channel *src)
75{
76   dst->f[0] = fabsf(src->f[0]);
77   dst->f[1] = fabsf(src->f[1]);
78   dst->f[2] = fabsf(src->f[2]);
79   dst->f[3] = fabsf(src->f[3]);
80}
81
82static void
83micro_arl(union tgsi_exec_channel *dst,
84          const union tgsi_exec_channel *src)
85{
86   dst->i[0] = (int)floorf(src->f[0]);
87   dst->i[1] = (int)floorf(src->f[1]);
88   dst->i[2] = (int)floorf(src->f[2]);
89   dst->i[3] = (int)floorf(src->f[3]);
90}
91
92static void
93micro_arr(union tgsi_exec_channel *dst,
94          const union tgsi_exec_channel *src)
95{
96   dst->i[0] = (int)floorf(src->f[0] + 0.5f);
97   dst->i[1] = (int)floorf(src->f[1] + 0.5f);
98   dst->i[2] = (int)floorf(src->f[2] + 0.5f);
99   dst->i[3] = (int)floorf(src->f[3] + 0.5f);
100}
101
102static void
103micro_ceil(union tgsi_exec_channel *dst,
104           const union tgsi_exec_channel *src)
105{
106   dst->f[0] = ceilf(src->f[0]);
107   dst->f[1] = ceilf(src->f[1]);
108   dst->f[2] = ceilf(src->f[2]);
109   dst->f[3] = ceilf(src->f[3]);
110}
111
112static void
113micro_clamp(union tgsi_exec_channel *dst,
114            const union tgsi_exec_channel *src0,
115            const union tgsi_exec_channel *src1,
116            const union tgsi_exec_channel *src2)
117{
118   dst->f[0] = src0->f[0] < src1->f[0] ? src1->f[0] : src0->f[0] > src2->f[0] ? src2->f[0] : src0->f[0];
119   dst->f[1] = src0->f[1] < src1->f[1] ? src1->f[1] : src0->f[1] > src2->f[1] ? src2->f[1] : src0->f[1];
120   dst->f[2] = src0->f[2] < src1->f[2] ? src1->f[2] : src0->f[2] > src2->f[2] ? src2->f[2] : src0->f[2];
121   dst->f[3] = src0->f[3] < src1->f[3] ? src1->f[3] : src0->f[3] > src2->f[3] ? src2->f[3] : src0->f[3];
122}
123
124static void
125micro_cmp(union tgsi_exec_channel *dst,
126          const union tgsi_exec_channel *src0,
127          const union tgsi_exec_channel *src1,
128          const union tgsi_exec_channel *src2)
129{
130   dst->f[0] = src0->f[0] < 0.0f ? src1->f[0] : src2->f[0];
131   dst->f[1] = src0->f[1] < 0.0f ? src1->f[1] : src2->f[1];
132   dst->f[2] = src0->f[2] < 0.0f ? src1->f[2] : src2->f[2];
133   dst->f[3] = src0->f[3] < 0.0f ? src1->f[3] : src2->f[3];
134}
135
136static void
137micro_cnd(union tgsi_exec_channel *dst,
138          const union tgsi_exec_channel *src0,
139          const union tgsi_exec_channel *src1,
140          const union tgsi_exec_channel *src2)
141{
142   dst->f[0] = src2->f[0] > 0.5f ? src0->f[0] : src1->f[0];
143   dst->f[1] = src2->f[1] > 0.5f ? src0->f[1] : src1->f[1];
144   dst->f[2] = src2->f[2] > 0.5f ? src0->f[2] : src1->f[2];
145   dst->f[3] = src2->f[3] > 0.5f ? src0->f[3] : src1->f[3];
146}
147
148static void
149micro_cos(union tgsi_exec_channel *dst,
150          const union tgsi_exec_channel *src)
151{
152   dst->f[0] = cosf(src->f[0]);
153   dst->f[1] = cosf(src->f[1]);
154   dst->f[2] = cosf(src->f[2]);
155   dst->f[3] = cosf(src->f[3]);
156}
157
158static void
159micro_ddx(union tgsi_exec_channel *dst,
160          const union tgsi_exec_channel *src)
161{
162   dst->f[0] =
163   dst->f[1] =
164   dst->f[2] =
165   dst->f[3] = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_BOTTOM_LEFT];
166}
167
168static void
169micro_ddy(union tgsi_exec_channel *dst,
170          const union tgsi_exec_channel *src)
171{
172   dst->f[0] =
173   dst->f[1] =
174   dst->f[2] =
175   dst->f[3] = src->f[TILE_BOTTOM_LEFT] - src->f[TILE_TOP_LEFT];
176}
177
178static void
179micro_exp2(union tgsi_exec_channel *dst,
180           const union tgsi_exec_channel *src)
181{
182#if FAST_MATH
183   dst->f[0] = util_fast_exp2(src->f[0]);
184   dst->f[1] = util_fast_exp2(src->f[1]);
185   dst->f[2] = util_fast_exp2(src->f[2]);
186   dst->f[3] = util_fast_exp2(src->f[3]);
187#else
188#if DEBUG
189   /* Inf is okay for this instruction, so clamp it to silence assertions. */
190   uint i;
191   union tgsi_exec_channel clamped;
192
193   for (i = 0; i < 4; i++) {
194      if (src->f[i] > 127.99999f) {
195         clamped.f[i] = 127.99999f;
196      } else if (src->f[i] < -126.99999f) {
197         clamped.f[i] = -126.99999f;
198      } else {
199         clamped.f[i] = src->f[i];
200      }
201   }
202   src = &clamped;
203#endif /* DEBUG */
204
205   dst->f[0] = powf(2.0f, src->f[0]);
206   dst->f[1] = powf(2.0f, src->f[1]);
207   dst->f[2] = powf(2.0f, src->f[2]);
208   dst->f[3] = powf(2.0f, src->f[3]);
209#endif /* FAST_MATH */
210}
211
212static void
213micro_flr(union tgsi_exec_channel *dst,
214          const union tgsi_exec_channel *src)
215{
216   dst->f[0] = floorf(src->f[0]);
217   dst->f[1] = floorf(src->f[1]);
218   dst->f[2] = floorf(src->f[2]);
219   dst->f[3] = floorf(src->f[3]);
220}
221
222static void
223micro_frc(union tgsi_exec_channel *dst,
224          const union tgsi_exec_channel *src)
225{
226   dst->f[0] = src->f[0] - floorf(src->f[0]);
227   dst->f[1] = src->f[1] - floorf(src->f[1]);
228   dst->f[2] = src->f[2] - floorf(src->f[2]);
229   dst->f[3] = src->f[3] - floorf(src->f[3]);
230}
231
232static void
233micro_iabs(union tgsi_exec_channel *dst,
234           const union tgsi_exec_channel *src)
235{
236   dst->i[0] = src->i[0] >= 0 ? src->i[0] : -src->i[0];
237   dst->i[1] = src->i[1] >= 0 ? src->i[1] : -src->i[1];
238   dst->i[2] = src->i[2] >= 0 ? src->i[2] : -src->i[2];
239   dst->i[3] = src->i[3] >= 0 ? src->i[3] : -src->i[3];
240}
241
242static void
243micro_ineg(union tgsi_exec_channel *dst,
244           const union tgsi_exec_channel *src)
245{
246   dst->i[0] = -src->i[0];
247   dst->i[1] = -src->i[1];
248   dst->i[2] = -src->i[2];
249   dst->i[3] = -src->i[3];
250}
251
252static void
253micro_lg2(union tgsi_exec_channel *dst,
254          const union tgsi_exec_channel *src)
255{
256#if FAST_MATH
257   dst->f[0] = util_fast_log2(src->f[0]);
258   dst->f[1] = util_fast_log2(src->f[1]);
259   dst->f[2] = util_fast_log2(src->f[2]);
260   dst->f[3] = util_fast_log2(src->f[3]);
261#else
262   dst->f[0] = logf(src->f[0]) * 1.442695f;
263   dst->f[1] = logf(src->f[1]) * 1.442695f;
264   dst->f[2] = logf(src->f[2]) * 1.442695f;
265   dst->f[3] = logf(src->f[3]) * 1.442695f;
266#endif
267}
268
269static void
270micro_lrp(union tgsi_exec_channel *dst,
271          const union tgsi_exec_channel *src0,
272          const union tgsi_exec_channel *src1,
273          const union tgsi_exec_channel *src2)
274{
275   dst->f[0] = src0->f[0] * (src1->f[0] - src2->f[0]) + src2->f[0];
276   dst->f[1] = src0->f[1] * (src1->f[1] - src2->f[1]) + src2->f[1];
277   dst->f[2] = src0->f[2] * (src1->f[2] - src2->f[2]) + src2->f[2];
278   dst->f[3] = src0->f[3] * (src1->f[3] - src2->f[3]) + src2->f[3];
279}
280
281static void
282micro_mad(union tgsi_exec_channel *dst,
283          const union tgsi_exec_channel *src0,
284          const union tgsi_exec_channel *src1,
285          const union tgsi_exec_channel *src2)
286{
287   dst->f[0] = src0->f[0] * src1->f[0] + src2->f[0];
288   dst->f[1] = src0->f[1] * src1->f[1] + src2->f[1];
289   dst->f[2] = src0->f[2] * src1->f[2] + src2->f[2];
290   dst->f[3] = src0->f[3] * src1->f[3] + src2->f[3];
291}
292
293static void
294micro_mov(union tgsi_exec_channel *dst,
295          const union tgsi_exec_channel *src)
296{
297   dst->u[0] = src->u[0];
298   dst->u[1] = src->u[1];
299   dst->u[2] = src->u[2];
300   dst->u[3] = src->u[3];
301}
302
303static void
304micro_rcp(union tgsi_exec_channel *dst,
305          const union tgsi_exec_channel *src)
306{
307#if 0 /* for debugging */
308   assert(src->f[0] != 0.0f);
309   assert(src->f[1] != 0.0f);
310   assert(src->f[2] != 0.0f);
311   assert(src->f[3] != 0.0f);
312#endif
313   dst->f[0] = 1.0f / src->f[0];
314   dst->f[1] = 1.0f / src->f[1];
315   dst->f[2] = 1.0f / src->f[2];
316   dst->f[3] = 1.0f / src->f[3];
317}
318
319static void
320micro_rnd(union tgsi_exec_channel *dst,
321          const union tgsi_exec_channel *src)
322{
323   dst->f[0] = floorf(src->f[0] + 0.5f);
324   dst->f[1] = floorf(src->f[1] + 0.5f);
325   dst->f[2] = floorf(src->f[2] + 0.5f);
326   dst->f[3] = floorf(src->f[3] + 0.5f);
327}
328
329static void
330micro_rsq(union tgsi_exec_channel *dst,
331          const union tgsi_exec_channel *src)
332{
333#if 0 /* for debugging */
334   assert(src->f[0] != 0.0f);
335   assert(src->f[1] != 0.0f);
336   assert(src->f[2] != 0.0f);
337   assert(src->f[3] != 0.0f);
338#endif
339   dst->f[0] = 1.0f / sqrtf(fabsf(src->f[0]));
340   dst->f[1] = 1.0f / sqrtf(fabsf(src->f[1]));
341   dst->f[2] = 1.0f / sqrtf(fabsf(src->f[2]));
342   dst->f[3] = 1.0f / sqrtf(fabsf(src->f[3]));
343}
344
345static void
346micro_seq(union tgsi_exec_channel *dst,
347          const union tgsi_exec_channel *src0,
348          const union tgsi_exec_channel *src1)
349{
350   dst->f[0] = src0->f[0] == src1->f[0] ? 1.0f : 0.0f;
351   dst->f[1] = src0->f[1] == src1->f[1] ? 1.0f : 0.0f;
352   dst->f[2] = src0->f[2] == src1->f[2] ? 1.0f : 0.0f;
353   dst->f[3] = src0->f[3] == src1->f[3] ? 1.0f : 0.0f;
354}
355
356static void
357micro_sge(union tgsi_exec_channel *dst,
358          const union tgsi_exec_channel *src0,
359          const union tgsi_exec_channel *src1)
360{
361   dst->f[0] = src0->f[0] >= src1->f[0] ? 1.0f : 0.0f;
362   dst->f[1] = src0->f[1] >= src1->f[1] ? 1.0f : 0.0f;
363   dst->f[2] = src0->f[2] >= src1->f[2] ? 1.0f : 0.0f;
364   dst->f[3] = src0->f[3] >= src1->f[3] ? 1.0f : 0.0f;
365}
366
367static void
368micro_sgn(union tgsi_exec_channel *dst,
369          const union tgsi_exec_channel *src)
370{
371   dst->f[0] = src->f[0] < 0.0f ? -1.0f : src->f[0] > 0.0f ? 1.0f : 0.0f;
372   dst->f[1] = src->f[1] < 0.0f ? -1.0f : src->f[1] > 0.0f ? 1.0f : 0.0f;
373   dst->f[2] = src->f[2] < 0.0f ? -1.0f : src->f[2] > 0.0f ? 1.0f : 0.0f;
374   dst->f[3] = src->f[3] < 0.0f ? -1.0f : src->f[3] > 0.0f ? 1.0f : 0.0f;
375}
376
377static void
378micro_isgn(union tgsi_exec_channel *dst,
379          const union tgsi_exec_channel *src)
380{
381   dst->i[0] = src->i[0] < 0 ? -1 : src->i[0] > 0 ? 1 : 0;
382   dst->i[1] = src->i[1] < 0 ? -1 : src->i[1] > 0 ? 1 : 0;
383   dst->i[2] = src->i[2] < 0 ? -1 : src->i[2] > 0 ? 1 : 0;
384   dst->i[3] = src->i[3] < 0 ? -1 : src->i[3] > 0 ? 1 : 0;
385}
386
387static void
388micro_sgt(union tgsi_exec_channel *dst,
389          const union tgsi_exec_channel *src0,
390          const union tgsi_exec_channel *src1)
391{
392   dst->f[0] = src0->f[0] > src1->f[0] ? 1.0f : 0.0f;
393   dst->f[1] = src0->f[1] > src1->f[1] ? 1.0f : 0.0f;
394   dst->f[2] = src0->f[2] > src1->f[2] ? 1.0f : 0.0f;
395   dst->f[3] = src0->f[3] > src1->f[3] ? 1.0f : 0.0f;
396}
397
398static void
399micro_sin(union tgsi_exec_channel *dst,
400          const union tgsi_exec_channel *src)
401{
402   dst->f[0] = sinf(src->f[0]);
403   dst->f[1] = sinf(src->f[1]);
404   dst->f[2] = sinf(src->f[2]);
405   dst->f[3] = sinf(src->f[3]);
406}
407
408static void
409micro_sle(union tgsi_exec_channel *dst,
410          const union tgsi_exec_channel *src0,
411          const union tgsi_exec_channel *src1)
412{
413   dst->f[0] = src0->f[0] <= src1->f[0] ? 1.0f : 0.0f;
414   dst->f[1] = src0->f[1] <= src1->f[1] ? 1.0f : 0.0f;
415   dst->f[2] = src0->f[2] <= src1->f[2] ? 1.0f : 0.0f;
416   dst->f[3] = src0->f[3] <= src1->f[3] ? 1.0f : 0.0f;
417}
418
419static void
420micro_slt(union tgsi_exec_channel *dst,
421          const union tgsi_exec_channel *src0,
422          const union tgsi_exec_channel *src1)
423{
424   dst->f[0] = src0->f[0] < src1->f[0] ? 1.0f : 0.0f;
425   dst->f[1] = src0->f[1] < src1->f[1] ? 1.0f : 0.0f;
426   dst->f[2] = src0->f[2] < src1->f[2] ? 1.0f : 0.0f;
427   dst->f[3] = src0->f[3] < src1->f[3] ? 1.0f : 0.0f;
428}
429
430static void
431micro_sne(union tgsi_exec_channel *dst,
432          const union tgsi_exec_channel *src0,
433          const union tgsi_exec_channel *src1)
434{
435   dst->f[0] = src0->f[0] != src1->f[0] ? 1.0f : 0.0f;
436   dst->f[1] = src0->f[1] != src1->f[1] ? 1.0f : 0.0f;
437   dst->f[2] = src0->f[2] != src1->f[2] ? 1.0f : 0.0f;
438   dst->f[3] = src0->f[3] != src1->f[3] ? 1.0f : 0.0f;
439}
440
441static void
442micro_sfl(union tgsi_exec_channel *dst)
443{
444   dst->f[0] = 0.0f;
445   dst->f[1] = 0.0f;
446   dst->f[2] = 0.0f;
447   dst->f[3] = 0.0f;
448}
449
450static void
451micro_str(union tgsi_exec_channel *dst)
452{
453   dst->f[0] = 1.0f;
454   dst->f[1] = 1.0f;
455   dst->f[2] = 1.0f;
456   dst->f[3] = 1.0f;
457}
458
459static void
460micro_trunc(union tgsi_exec_channel *dst,
461            const union tgsi_exec_channel *src)
462{
463   dst->f[0] = (float)(int)src->f[0];
464   dst->f[1] = (float)(int)src->f[1];
465   dst->f[2] = (float)(int)src->f[2];
466   dst->f[3] = (float)(int)src->f[3];
467}
468
469
/* Numeric indices of the X, Y, Z and W channels. */
#define CHAN_X  0
#define CHAN_Y  1
#define CHAN_Z  2
#define CHAN_W  3
474
/** Selects which data type (float, signed or unsigned int) an operation uses. */
enum tgsi_exec_datatype {
   TGSI_EXEC_DATA_FLOAT = 0,
   TGSI_EXEC_DATA_INT   = 1,
   TGSI_EXEC_DATA_UINT  = 2
};
480
/*
 * Shorthand locations of various utility registers (_I = Index, _C = Channel).
 * Each name is a plain alias for the TGSI_EXEC_TEMP_* macro with the same
 * suffix.
 */
#define TEMP_KILMASK_I     TGSI_EXEC_TEMP_KILMASK_I
#define TEMP_KILMASK_C     TGSI_EXEC_TEMP_KILMASK_C
#define TEMP_OUTPUT_I      TGSI_EXEC_TEMP_OUTPUT_I
#define TEMP_OUTPUT_C      TGSI_EXEC_TEMP_OUTPUT_C
#define TEMP_PRIMITIVE_I   TGSI_EXEC_TEMP_PRIMITIVE_I
#define TEMP_PRIMITIVE_C   TGSI_EXEC_TEMP_PRIMITIVE_C
490
491
/**
 * Recompute the execution mask as the bitwise AND of the conditional, loop,
 * continue, switch and function masks.
 *
 * MACH is a pointer to the machine.  The argument and the whole expansion
 * are parenthesized so that expressions such as &mach work and the macro
 * can be used inside a larger expression.  MACH is evaluated several times,
 * so it must not have side effects.
 */
#define UPDATE_EXEC_MASK(MACH) \
      ((MACH)->ExecMask = (MACH)->CondMask & (MACH)->LoopMask & \
                          (MACH)->ContMask & (MACH)->Switch.mask & \
                          (MACH)->FuncMask)
495
496
497static const union tgsi_exec_channel ZeroVec =
498   { { 0.0, 0.0, 0.0, 0.0 } };
499
500static const union tgsi_exec_channel OneVec = {
501   {1.0f, 1.0f, 1.0f, 1.0f}
502};
503
504static const union tgsi_exec_channel P128Vec = {
505   {128.0f, 128.0f, 128.0f, 128.0f}
506};
507
508static const union tgsi_exec_channel M128Vec = {
509   {-128.0f, -128.0f, -128.0f, -128.0f}
510};
511
512
/**
 * Assert that none of the float values in 'chan' are infinite or NaN.
 * NaN and Inf may occur normally during program execution and should
 * not lead to crashes, etc.  But when debugging, it's helpful to catch
 * them.
 *
 * \param chan  channel whose four float elements are tested, one assert
 *              per element, using util_is_inf_or_nan()
 */
static INLINE void
check_inf_or_nan(const union tgsi_exec_channel *chan)
{
   assert(!util_is_inf_or_nan((chan)->f[0]));
   assert(!util_is_inf_or_nan((chan)->f[1]));
   assert(!util_is_inf_or_nan((chan)->f[2]));
   assert(!util_is_inf_or_nan((chan)->f[3]));
}
527
528
#ifdef DEBUG
/** Debug helper: print 'msg' followed by the four float elements of 'chan'. */
static void
print_chan(const char *msg, const union tgsi_exec_channel *chan)
{
   const float *values = chan->f;

   debug_printf("%s = {%f, %f, %f, %f}\n",
                msg, values[0], values[1], values[2], values[3]);
}
#endif
537
538
#ifdef DEBUG
/**
 * Debug helper: print temporary register 'index' of the machine, one line
 * per X/Y/Z/W channel with its four float elements.
 */
static void
print_temp(const struct tgsi_exec_machine *mach, uint index)
{
   static const char channelNames[] = "XYZW";
   int chan;

   debug_printf("Temp[%u] =\n", index);
   for (chan = 0; chan < 4; chan++) {
      const float *values = mach->Temps[index].xyzw[chan].f;

      debug_printf("  %c: { %f, %f, %f, %f }\n",
                   channelNames[chan],
                   values[0],
                   values[1],
                   values[2],
                   values[3]);
   }
}
#endif
556
557
558void
559tgsi_exec_set_constant_buffers(struct tgsi_exec_machine *mach,
560                               unsigned num_bufs,
561                               const void **bufs,
562                               const unsigned *buf_sizes)
563{
564   unsigned i;
565
566   for (i = 0; i < num_bufs; i++) {
567      mach->Consts[i] = bufs[i];
568      mach->ConstsSize[i] = buf_sizes[i];
569   }
570}
571
572
573/**
574 * Check if there's a potential src/dst register data dependency when
575 * using SOA execution.
576 * Example:
577 *   MOV T, T.yxwz;
578 * This would expand into:
579 *   MOV t0, t1;
580 *   MOV t1, t0;
581 *   MOV t2, t3;
582 *   MOV t3, t2;
583 * The second instruction will have the wrong value for t0 if executed as-is.
584 */
585boolean
586tgsi_check_soa_dependencies(const struct tgsi_full_instruction *inst)
587{
588   uint i, chan;
589
590   uint writemask = inst->Dst[0].Register.WriteMask;
591   if (writemask == TGSI_WRITEMASK_X ||
592       writemask == TGSI_WRITEMASK_Y ||
593       writemask == TGSI_WRITEMASK_Z ||
594       writemask == TGSI_WRITEMASK_W ||
595       writemask == TGSI_WRITEMASK_NONE) {
596      /* no chance of data dependency */
597      return FALSE;
598   }
599
600   /* loop over src regs */
601   for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
602      if ((inst->Src[i].Register.File ==
603           inst->Dst[0].Register.File) &&
604          ((inst->Src[i].Register.Index ==
605            inst->Dst[0].Register.Index) ||
606           inst->Src[i].Register.Indirect ||
607           inst->Dst[0].Register.Indirect)) {
608         /* loop over dest channels */
609         uint channelsWritten = 0x0;
610         for (chan = 0; chan < NUM_CHANNELS; chan++) {
611            if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
612               /* check if we're reading a channel that's been written */
613               uint swizzle = tgsi_util_get_full_src_register_swizzle(&inst->Src[i], chan);
614               if (channelsWritten & (1 << swizzle)) {
615                  return TRUE;
616               }
617
618               channelsWritten |= (1 << chan);
619            }
620         }
621      }
622   }
623   return FALSE;
624}
625
626
627/**
628 * Initialize machine state by expanding tokens to full instructions,
629 * allocating temporary storage, setting up constants, etc.
630 * After this, we can call tgsi_exec_machine_run() many times.
631 */
632void
633tgsi_exec_machine_bind_shader(
634   struct tgsi_exec_machine *mach,
635   const struct tgsi_token *tokens,
636   uint numSamplers,
637   struct tgsi_sampler **samplers)
638{
639   uint k;
640   struct tgsi_parse_context parse;
641   struct tgsi_full_instruction *instructions;
642   struct tgsi_full_declaration *declarations;
643   uint maxInstructions = 10, numInstructions = 0;
644   uint maxDeclarations = 10, numDeclarations = 0;
645
646#if 0
647   tgsi_dump(tokens, 0);
648#endif
649
650   util_init_math();
651
652   if (numSamplers) {
653      assert(samplers);
654   }
655
656   mach->Tokens = tokens;
657   mach->Samplers = samplers;
658
659   if (!tokens) {
660      /* unbind and free all */
661      if (mach->Declarations) {
662         FREE( mach->Declarations );
663      }
664      mach->Declarations = NULL;
665      mach->NumDeclarations = 0;
666
667      if (mach->Instructions) {
668         FREE( mach->Instructions );
669      }
670      mach->Instructions = NULL;
671      mach->NumInstructions = 0;
672
673      return;
674   }
675
676   k = tgsi_parse_init (&parse, mach->Tokens);
677   if (k != TGSI_PARSE_OK) {
678      debug_printf( "Problem parsing!\n" );
679      return;
680   }
681
682   mach->Processor = parse.FullHeader.Processor.Processor;
683   mach->ImmLimit = 0;
684
685   if (mach->Processor == TGSI_PROCESSOR_GEOMETRY &&
686       !mach->UsedGeometryShader) {
687      struct tgsi_exec_vector *inputs;
688      struct tgsi_exec_vector *outputs;
689
690      inputs = align_malloc(sizeof(struct tgsi_exec_vector) *
691                            TGSI_MAX_PRIM_VERTICES * PIPE_MAX_ATTRIBS,
692                            16);
693
694      if (!inputs)
695         return;
696
697      outputs = align_malloc(sizeof(struct tgsi_exec_vector) *
698                             TGSI_MAX_TOTAL_VERTICES, 16);
699
700      if (!outputs) {
701         align_free(inputs);
702         return;
703      }
704
705      align_free(mach->Inputs);
706      align_free(mach->Outputs);
707
708      mach->Inputs = inputs;
709      mach->Outputs = outputs;
710      mach->UsedGeometryShader = TRUE;
711   }
712
713   declarations = (struct tgsi_full_declaration *)
714      MALLOC( maxDeclarations * sizeof(struct tgsi_full_declaration) );
715
716   if (!declarations) {
717      return;
718   }
719
720   instructions = (struct tgsi_full_instruction *)
721      MALLOC( maxInstructions * sizeof(struct tgsi_full_instruction) );
722
723   if (!instructions) {
724      FREE( declarations );
725      return;
726   }
727
728   while( !tgsi_parse_end_of_tokens( &parse ) ) {
729      uint i;
730
731      tgsi_parse_token( &parse );
732      switch( parse.FullToken.Token.Type ) {
733      case TGSI_TOKEN_TYPE_DECLARATION:
734         /* save expanded declaration */
735         if (numDeclarations == maxDeclarations) {
736            declarations = REALLOC(declarations,
737                                   maxDeclarations
738                                   * sizeof(struct tgsi_full_declaration),
739                                   (maxDeclarations + 10)
740                                   * sizeof(struct tgsi_full_declaration));
741            maxDeclarations += 10;
742         }
743         if (parse.FullToken.FullDeclaration.Declaration.File == TGSI_FILE_OUTPUT) {
744            unsigned reg;
745            for (reg = parse.FullToken.FullDeclaration.Range.First;
746                 reg <= parse.FullToken.FullDeclaration.Range.Last;
747                 ++reg) {
748               ++mach->NumOutputs;
749            }
750         }
751         if (parse.FullToken.FullDeclaration.Declaration.File ==
752             TGSI_FILE_IMMEDIATE_ARRAY) {
753            unsigned reg;
754            struct tgsi_full_declaration *decl =
755               &parse.FullToken.FullDeclaration;
756            debug_assert(decl->Range.Last < TGSI_EXEC_NUM_IMMEDIATES);
757            for (reg = decl->Range.First; reg <= decl->Range.Last; ++reg) {
758               for( i = 0; i < 4; i++ ) {
759                  int idx = reg * 4 + i;
760                  mach->ImmArray[reg][i] = decl->ImmediateData.u[idx].Float;
761               }
762            }
763         }
764         memcpy(declarations + numDeclarations,
765                &parse.FullToken.FullDeclaration,
766                sizeof(declarations[0]));
767         numDeclarations++;
768         break;
769
770      case TGSI_TOKEN_TYPE_IMMEDIATE:
771         {
772            uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
773            assert( size <= 4 );
774            assert( mach->ImmLimit + 1 <= TGSI_EXEC_NUM_IMMEDIATES );
775
776            for( i = 0; i < size; i++ ) {
777               mach->Imms[mach->ImmLimit][i] =
778		  parse.FullToken.FullImmediate.u[i].Float;
779            }
780            mach->ImmLimit += 1;
781         }
782         break;
783
784      case TGSI_TOKEN_TYPE_INSTRUCTION:
785
786         /* save expanded instruction */
787         if (numInstructions == maxInstructions) {
788            instructions = REALLOC(instructions,
789                                   maxInstructions
790                                   * sizeof(struct tgsi_full_instruction),
791                                   (maxInstructions + 10)
792                                   * sizeof(struct tgsi_full_instruction));
793            maxInstructions += 10;
794         }
795
796         memcpy(instructions + numInstructions,
797                &parse.FullToken.FullInstruction,
798                sizeof(instructions[0]));
799
800         numInstructions++;
801         break;
802
803      case TGSI_TOKEN_TYPE_PROPERTY:
804         break;
805
806      default:
807         assert( 0 );
808      }
809   }
810   tgsi_parse_free (&parse);
811
812   if (mach->Declarations) {
813      FREE( mach->Declarations );
814   }
815   mach->Declarations = declarations;
816   mach->NumDeclarations = numDeclarations;
817
818   if (mach->Instructions) {
819      FREE( mach->Instructions );
820   }
821   mach->Instructions = instructions;
822   mach->NumInstructions = numInstructions;
823}
824
825
826struct tgsi_exec_machine *
827tgsi_exec_machine_create( void )
828{
829   struct tgsi_exec_machine *mach;
830   uint i;
831
832   mach = align_malloc( sizeof *mach, 16 );
833   if (!mach)
834      goto fail;
835
836   memset(mach, 0, sizeof(*mach));
837
838   mach->Addrs = &mach->Temps[TGSI_EXEC_TEMP_ADDR];
839   mach->MaxGeometryShaderOutputs = TGSI_MAX_TOTAL_VERTICES;
840   mach->Predicates = &mach->Temps[TGSI_EXEC_TEMP_P0];
841
842   mach->Inputs = align_malloc(sizeof(struct tgsi_exec_vector) * PIPE_MAX_ATTRIBS, 16);
843   mach->Outputs = align_malloc(sizeof(struct tgsi_exec_vector) * PIPE_MAX_ATTRIBS, 16);
844   if (!mach->Inputs || !mach->Outputs)
845      goto fail;
846
847   /* Setup constants needed by the SSE2 executor. */
848   for( i = 0; i < 4; i++ ) {
849      mach->Temps[TGSI_EXEC_TEMP_00000000_I].xyzw[TGSI_EXEC_TEMP_00000000_C].u[i] = 0x00000000;
850      mach->Temps[TGSI_EXEC_TEMP_7FFFFFFF_I].xyzw[TGSI_EXEC_TEMP_7FFFFFFF_C].u[i] = 0x7FFFFFFF;
851      mach->Temps[TGSI_EXEC_TEMP_80000000_I].xyzw[TGSI_EXEC_TEMP_80000000_C].u[i] = 0x80000000;
852      mach->Temps[TGSI_EXEC_TEMP_FFFFFFFF_I].xyzw[TGSI_EXEC_TEMP_FFFFFFFF_C].u[i] = 0xFFFFFFFF;    /* not used */
853      mach->Temps[TGSI_EXEC_TEMP_ONE_I].xyzw[TGSI_EXEC_TEMP_ONE_C].f[i] = 1.0f;
854      mach->Temps[TGSI_EXEC_TEMP_TWO_I].xyzw[TGSI_EXEC_TEMP_TWO_C].f[i] = 2.0f;    /* not used */
855      mach->Temps[TGSI_EXEC_TEMP_128_I].xyzw[TGSI_EXEC_TEMP_128_C].f[i] = 128.0f;
856      mach->Temps[TGSI_EXEC_TEMP_MINUS_128_I].xyzw[TGSI_EXEC_TEMP_MINUS_128_C].f[i] = -128.0f;
857      mach->Temps[TGSI_EXEC_TEMP_THREE_I].xyzw[TGSI_EXEC_TEMP_THREE_C].f[i] = 3.0f;
858      mach->Temps[TGSI_EXEC_TEMP_HALF_I].xyzw[TGSI_EXEC_TEMP_HALF_C].f[i] = 0.5f;
859   }
860
861#ifdef DEBUG
862   /* silence warnings */
863   (void) print_chan;
864   (void) print_temp;
865#endif
866
867   return mach;
868
869fail:
870   if (mach) {
871      align_free(mach->Inputs);
872      align_free(mach->Outputs);
873      align_free(mach);
874   }
875   return NULL;
876}
877
878
879void
880tgsi_exec_machine_destroy(struct tgsi_exec_machine *mach)
881{
882   if (mach) {
883      if (mach->Instructions)
884         FREE(mach->Instructions);
885      if (mach->Declarations)
886         FREE(mach->Declarations);
887
888      align_free(mach->Inputs);
889      align_free(mach->Outputs);
890
891      align_free(mach);
892   }
893}
894
895static void
896micro_add(union tgsi_exec_channel *dst,
897          const union tgsi_exec_channel *src0,
898          const union tgsi_exec_channel *src1)
899{
900   dst->f[0] = src0->f[0] + src1->f[0];
901   dst->f[1] = src0->f[1] + src1->f[1];
902   dst->f[2] = src0->f[2] + src1->f[2];
903   dst->f[3] = src0->f[3] + src1->f[3];
904}
905
906static void
907micro_div(
908   union tgsi_exec_channel *dst,
909   const union tgsi_exec_channel *src0,
910   const union tgsi_exec_channel *src1 )
911{
912   if (src1->f[0] != 0) {
913      dst->f[0] = src0->f[0] / src1->f[0];
914   }
915   if (src1->f[1] != 0) {
916      dst->f[1] = src0->f[1] / src1->f[1];
917   }
918   if (src1->f[2] != 0) {
919      dst->f[2] = src0->f[2] / src1->f[2];
920   }
921   if (src1->f[3] != 0) {
922      dst->f[3] = src0->f[3] / src1->f[3];
923   }
924}
925
926static void
927micro_rcc(union tgsi_exec_channel *dst,
928          const union tgsi_exec_channel *src)
929{
930   uint i;
931
932   for (i = 0; i < 4; i++) {
933      float recip = 1.0f / src->f[i];
934
935      if (recip > 0.0f) {
936         if (recip > 1.884467e+019f) {
937            dst->f[i] = 1.884467e+019f;
938         }
939         else if (recip < 5.42101e-020f) {
940            dst->f[i] = 5.42101e-020f;
941         }
942         else {
943            dst->f[i] = recip;
944         }
945      }
946      else {
947         if (recip < -1.884467e+019f) {
948            dst->f[i] = -1.884467e+019f;
949         }
950         else if (recip > -5.42101e-020f) {
951            dst->f[i] = -5.42101e-020f;
952         }
953         else {
954            dst->f[i] = recip;
955         }
956      }
957   }
958}
959
960static void
961micro_lt(
962   union tgsi_exec_channel *dst,
963   const union tgsi_exec_channel *src0,
964   const union tgsi_exec_channel *src1,
965   const union tgsi_exec_channel *src2,
966   const union tgsi_exec_channel *src3 )
967{
968   dst->f[0] = src0->f[0] < src1->f[0] ? src2->f[0] : src3->f[0];
969   dst->f[1] = src0->f[1] < src1->f[1] ? src2->f[1] : src3->f[1];
970   dst->f[2] = src0->f[2] < src1->f[2] ? src2->f[2] : src3->f[2];
971   dst->f[3] = src0->f[3] < src1->f[3] ? src2->f[3] : src3->f[3];
972}
973
974static void
975micro_max(union tgsi_exec_channel *dst,
976          const union tgsi_exec_channel *src0,
977          const union tgsi_exec_channel *src1)
978{
979   dst->f[0] = src0->f[0] > src1->f[0] ? src0->f[0] : src1->f[0];
980   dst->f[1] = src0->f[1] > src1->f[1] ? src0->f[1] : src1->f[1];
981   dst->f[2] = src0->f[2] > src1->f[2] ? src0->f[2] : src1->f[2];
982   dst->f[3] = src0->f[3] > src1->f[3] ? src0->f[3] : src1->f[3];
983}
984
985static void
986micro_min(union tgsi_exec_channel *dst,
987          const union tgsi_exec_channel *src0,
988          const union tgsi_exec_channel *src1)
989{
990   dst->f[0] = src0->f[0] < src1->f[0] ? src0->f[0] : src1->f[0];
991   dst->f[1] = src0->f[1] < src1->f[1] ? src0->f[1] : src1->f[1];
992   dst->f[2] = src0->f[2] < src1->f[2] ? src0->f[2] : src1->f[2];
993   dst->f[3] = src0->f[3] < src1->f[3] ? src0->f[3] : src1->f[3];
994}
995
996static void
997micro_mul(union tgsi_exec_channel *dst,
998          const union tgsi_exec_channel *src0,
999          const union tgsi_exec_channel *src1)
1000{
1001   dst->f[0] = src0->f[0] * src1->f[0];
1002   dst->f[1] = src0->f[1] * src1->f[1];
1003   dst->f[2] = src0->f[2] * src1->f[2];
1004   dst->f[3] = src0->f[3] * src1->f[3];
1005}
1006
1007static void
1008micro_neg(
1009   union tgsi_exec_channel *dst,
1010   const union tgsi_exec_channel *src )
1011{
1012   dst->f[0] = -src->f[0];
1013   dst->f[1] = -src->f[1];
1014   dst->f[2] = -src->f[2];
1015   dst->f[3] = -src->f[3];
1016}
1017
1018static void
1019micro_pow(
1020   union tgsi_exec_channel *dst,
1021   const union tgsi_exec_channel *src0,
1022   const union tgsi_exec_channel *src1 )
1023{
1024#if FAST_MATH
1025   dst->f[0] = util_fast_pow( src0->f[0], src1->f[0] );
1026   dst->f[1] = util_fast_pow( src0->f[1], src1->f[1] );
1027   dst->f[2] = util_fast_pow( src0->f[2], src1->f[2] );
1028   dst->f[3] = util_fast_pow( src0->f[3], src1->f[3] );
1029#else
1030   dst->f[0] = powf( src0->f[0], src1->f[0] );
1031   dst->f[1] = powf( src0->f[1], src1->f[1] );
1032   dst->f[2] = powf( src0->f[2], src1->f[2] );
1033   dst->f[3] = powf( src0->f[3], src1->f[3] );
1034#endif
1035}
1036
1037static void
1038micro_sub(union tgsi_exec_channel *dst,
1039          const union tgsi_exec_channel *src0,
1040          const union tgsi_exec_channel *src1)
1041{
1042   dst->f[0] = src0->f[0] - src1->f[0];
1043   dst->f[1] = src0->f[1] - src1->f[1];
1044   dst->f[2] = src0->f[2] - src1->f[2];
1045   dst->f[3] = src0->f[3] - src1->f[3];
1046}
1047
/**
 * Read one channel of a register from the given register file, for every
 * element of the quad.
 *
 * Each quad element i uses its own register index (index->i[i]) and its
 * own second-dimension index (index2D->i[i]); 'swizzle' (0..3) selects
 * which of the register's four components is read.  The result for
 * element i is written to chan->u[i] (or chan->f[i] for immediates).
 *
 * 'chan_index' is accepted for the callers' convenience but is not
 * referenced in the body.
 *
 * Unknown register files trigger an assertion and yield zeros.
 */
static void
fetch_src_file_channel(const struct tgsi_exec_machine *mach,
                       const uint chan_index,
                       const uint file,
                       const uint swizzle,
                       const union tgsi_exec_channel *index,
                       const union tgsi_exec_channel *index2D,
                       union tgsi_exec_channel *chan)
{
   uint i;

   assert(swizzle < 4);

   switch (file) {
   case TGSI_FILE_CONSTANT:
      /* index2D selects the constant buffer, index the vec4 within it. */
      for (i = 0; i < QUAD_SIZE; i++) {
         assert(index2D->i[i] >= 0 && index2D->i[i] < PIPE_MAX_CONSTANT_BUFFERS);
         assert(mach->Consts[index2D->i[i]]);

         if (index->i[i] < 0) {
            /* negative register index: read as zero */
            chan->u[i] = 0;
         } else {
            /* NOTE: copying the const value as a uint instead of float */
            const uint constbuf = index2D->i[i];
            const uint *buf = (const uint *)mach->Consts[constbuf];
            /* position of the requested component, counted in 32-bit words */
            const int pos = index->i[i] * 4 + swizzle;
            /* const buffer bounds check: out-of-range reads return zero.
             * NOTE(review): pos is a word offset; confirm ConstsSize uses
             * the same unit.
             */
            if (pos < 0 || pos >= mach->ConstsSize[constbuf]) {
               if (0) {
                  /* Debug: print warning (compiled out; rate-limited to
                   * the first 100 occurrences when enabled)
                   */
                  static int count = 0;
                  if (count++ < 100)
                     debug_printf("TGSI Exec: const buffer index %d"
                                  " out of bounds\n", pos);
               }
               chan->u[i] = 0;
            }
            else
               chan->u[i] = buf[pos];
         }
      }
      break;

   case TGSI_FILE_INPUT:
      /* Inputs are stored as a flat array: the second dimension selects a
       * block of TGSI_EXEC_MAX_INPUT_ATTRIBS registers.
       */
      for (i = 0; i < QUAD_SIZE; i++) {
         /*
         if (TGSI_PROCESSOR_GEOMETRY == mach->Processor) {
            debug_printf("Fetching Input[%d] (2d=%d, 1d=%d)\n",
                         index2D->i[i] * TGSI_EXEC_MAX_INPUT_ATTRIBS + index->i[i],
                         index2D->i[i], index->i[i]);
                         }*/
         int pos = index2D->i[i] * TGSI_EXEC_MAX_INPUT_ATTRIBS + index->i[i];
         assert(pos >= 0);
         assert(pos < TGSI_MAX_PRIM_VERTICES * PIPE_MAX_ATTRIBS);
         chan->u[i] = mach->Inputs[pos].xyzw[swizzle].u[i];
      }
      break;

   case TGSI_FILE_SYSTEM_VALUE:
      /* XXX no swizzling at this point.  Will be needed if we put
       * gl_FragCoord, for example, in a sys value register.
       * Note that neither index nor index2D is range-checked here.
       */
      for (i = 0; i < QUAD_SIZE; i++) {
         chan->u[i] = mach->SystemValue[index->i[i]].u[i];
      }
      break;

   case TGSI_FILE_TEMPORARY:
      for (i = 0; i < QUAD_SIZE; i++) {
         /* only the upper bound is asserted */
         assert(index->i[i] < TGSI_EXEC_NUM_TEMPS);
         assert(index2D->i[i] == 0);

         chan->u[i] = mach->Temps[index->i[i]].xyzw[swizzle].u[i];
      }
      break;

   case TGSI_FILE_TEMPORARY_ARRAY:
      /* index2D selects the temporary array, index the register in it. */
      for (i = 0; i < QUAD_SIZE; i++) {
         assert(index->i[i] < TGSI_EXEC_NUM_TEMPS);
         assert(index2D->i[i] < TGSI_EXEC_NUM_TEMP_ARRAYS);

         chan->u[i] =
            mach->TempArray[index2D->i[i]][index->i[i]].xyzw[swizzle].u[i];
      }
      break;

   case TGSI_FILE_IMMEDIATE:
      /* Immediates are shared by all quad elements and are copied through
       * the float view of the channel.
       */
      for (i = 0; i < QUAD_SIZE; i++) {
         assert(index->i[i] >= 0 && index->i[i] < (int)mach->ImmLimit);
         assert(index2D->i[i] == 0);

         chan->f[i] = mach->Imms[index->i[i]][swizzle];
      }
      break;

   case TGSI_FILE_IMMEDIATE_ARRAY:
      /* Like TGSI_FILE_IMMEDIATE, but the register index is not
       * range-checked.
       */
      for (i = 0; i < QUAD_SIZE; i++) {
         assert(index2D->i[i] == 0);

         chan->f[i] = mach->ImmArray[index->i[i]][swizzle];
      }
      break;

   case TGSI_FILE_ADDRESS:
      for (i = 0; i < QUAD_SIZE; i++) {
         assert(index->i[i] >= 0);
         assert(index2D->i[i] == 0);

         chan->u[i] = mach->Addrs[index->i[i]].xyzw[swizzle].u[i];
      }
      break;

   case TGSI_FILE_PREDICATE:
      for (i = 0; i < QUAD_SIZE; i++) {
         assert(index->i[i] >= 0 && index->i[i] < TGSI_EXEC_NUM_PREDS);
         assert(index2D->i[i] == 0);

         /* The index is only asserted; predicate register 0 is always
          * the one that is read.
          */
         chan->u[i] = mach->Predicates[0].xyzw[swizzle].u[i];
      }
      break;

   case TGSI_FILE_OUTPUT:
      /* vertex/fragment output vars can be read too */
      for (i = 0; i < QUAD_SIZE; i++) {
         assert(index->i[i] >= 0);
         assert(index2D->i[i] == 0);

         chan->u[i] = mach->Outputs[index->i[i]].xyzw[swizzle].u[i];
      }
      break;

   default:
      /* unsupported register file: complain in debug builds, read zeros */
      assert(0);
      for (i = 0; i < QUAD_SIZE; i++) {
         chan->u[i] = 0;
      }
   }
}
1186
1187static void
1188fetch_source(const struct tgsi_exec_machine *mach,
1189             union tgsi_exec_channel *chan,
1190             const struct tgsi_full_src_register *reg,
1191             const uint chan_index,
1192             enum tgsi_exec_datatype src_datatype)
1193{
1194   union tgsi_exec_channel index;
1195   union tgsi_exec_channel index2D;
1196   uint swizzle;
1197
1198   /* We start with a direct index into a register file.
1199    *
1200    *    file[1],
1201    *    where:
1202    *       file = Register.File
1203    *       [1] = Register.Index
1204    */
1205   index.i[0] =
1206   index.i[1] =
1207   index.i[2] =
1208   index.i[3] = reg->Register.Index;
1209
1210   /* There is an extra source register that indirectly subscripts
1211    * a register file. The direct index now becomes an offset
1212    * that is being added to the indirect register.
1213    *
1214    *    file[ind[2].x+1],
1215    *    where:
1216    *       ind = Indirect.File
1217    *       [2] = Indirect.Index
1218    *       .x = Indirect.SwizzleX
1219    */
1220   if (reg->Register.Indirect) {
1221      union tgsi_exec_channel index2;
1222      union tgsi_exec_channel indir_index;
1223      const uint execmask = mach->ExecMask;
1224      uint i;
1225
1226      /* which address register (always zero now) */
1227      index2.i[0] =
1228      index2.i[1] =
1229      index2.i[2] =
1230      index2.i[3] = reg->Indirect.Index;
1231      assert(reg->Indirect.File == TGSI_FILE_ADDRESS);
1232      /* get current value of address register[swizzle] */
1233      swizzle = tgsi_util_get_src_register_swizzle( &reg->Indirect, CHAN_X );
1234      fetch_src_file_channel(mach,
1235                             chan_index,
1236                             reg->Indirect.File,
1237                             swizzle,
1238                             &index2,
1239                             &ZeroVec,
1240                             &indir_index);
1241
1242      /* add value of address register to the offset */
1243      index.i[0] += indir_index.i[0];
1244      index.i[1] += indir_index.i[1];
1245      index.i[2] += indir_index.i[2];
1246      index.i[3] += indir_index.i[3];
1247
1248      /* for disabled execution channels, zero-out the index to
1249       * avoid using a potential garbage value.
1250       */
1251      for (i = 0; i < QUAD_SIZE; i++) {
1252         if ((execmask & (1 << i)) == 0)
1253            index.i[i] = 0;
1254      }
1255   }
1256
1257   /* There is an extra source register that is a second
1258    * subscript to a register file. Effectively it means that
1259    * the register file is actually a 2D array of registers.
1260    *
1261    *    file[3][1],
1262    *    where:
1263    *       [3] = Dimension.Index
1264    */
1265   if (reg->Register.Dimension) {
1266      index2D.i[0] =
1267      index2D.i[1] =
1268      index2D.i[2] =
1269      index2D.i[3] = reg->Dimension.Index;
1270
1271      /* Again, the second subscript index can be addressed indirectly
1272       * identically to the first one.
1273       * Nothing stops us from indirectly addressing the indirect register,
1274       * but there is no need for that, so we won't exercise it.
1275       *
1276       *    file[ind[4].y+3][1],
1277       *    where:
1278       *       ind = DimIndirect.File
1279       *       [4] = DimIndirect.Index
1280       *       .y = DimIndirect.SwizzleX
1281       */
1282      if (reg->Dimension.Indirect) {
1283         union tgsi_exec_channel index2;
1284         union tgsi_exec_channel indir_index;
1285         const uint execmask = mach->ExecMask;
1286         uint i;
1287
1288         index2.i[0] =
1289         index2.i[1] =
1290         index2.i[2] =
1291         index2.i[3] = reg->DimIndirect.Index;
1292
1293         swizzle = tgsi_util_get_src_register_swizzle( &reg->DimIndirect, CHAN_X );
1294         fetch_src_file_channel(mach,
1295                                chan_index,
1296                                reg->DimIndirect.File,
1297                                swizzle,
1298                                &index2,
1299                                &ZeroVec,
1300                                &indir_index);
1301
1302         index2D.i[0] += indir_index.i[0];
1303         index2D.i[1] += indir_index.i[1];
1304         index2D.i[2] += indir_index.i[2];
1305         index2D.i[3] += indir_index.i[3];
1306
1307         /* for disabled execution channels, zero-out the index to
1308          * avoid using a potential garbage value.
1309          */
1310         for (i = 0; i < QUAD_SIZE; i++) {
1311            if ((execmask & (1 << i)) == 0) {
1312               index2D.i[i] = 0;
1313            }
1314         }
1315      }
1316
1317      /* If by any chance there was a need for a 3D array of register
1318       * files, we would have to check whether Dimension is followed
1319       * by a dimension register and continue the saga.
1320       */
1321   } else {
1322      index2D.i[0] =
1323      index2D.i[1] =
1324      index2D.i[2] =
1325      index2D.i[3] = 0;
1326   }
1327
1328   swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index );
1329   fetch_src_file_channel(mach,
1330                          chan_index,
1331                          reg->Register.File,
1332                          swizzle,
1333                          &index,
1334                          &index2D,
1335                          chan);
1336
1337   if (reg->Register.Absolute) {
1338      if (src_datatype == TGSI_EXEC_DATA_FLOAT) {
1339         micro_abs(chan, chan);
1340      } else {
1341         micro_iabs(chan, chan);
1342      }
1343   }
1344
1345   if (reg->Register.Negate) {
1346      if (src_datatype == TGSI_EXEC_DATA_FLOAT) {
1347         micro_neg(chan, chan);
1348      } else {
1349         micro_ineg(chan, chan);
1350      }
1351   }
1352}
1353
1354static void
1355store_dest(struct tgsi_exec_machine *mach,
1356           const union tgsi_exec_channel *chan,
1357           const struct tgsi_full_dst_register *reg,
1358           const struct tgsi_full_instruction *inst,
1359           uint chan_index,
1360           enum tgsi_exec_datatype dst_datatype)
1361{
1362   uint i;
1363   union tgsi_exec_channel null;
1364   union tgsi_exec_channel *dst;
1365   union tgsi_exec_channel index2D;
1366   uint execmask = mach->ExecMask;
1367   int offset = 0;  /* indirection offset */
1368   int index;
1369
1370   /* for debugging */
1371   if (0 && dst_datatype == TGSI_EXEC_DATA_FLOAT) {
1372      check_inf_or_nan(chan);
1373   }
1374
1375   /* There is an extra source register that indirectly subscripts
1376    * a register file. The direct index now becomes an offset
1377    * that is being added to the indirect register.
1378    *
1379    *    file[ind[2].x+1],
1380    *    where:
1381    *       ind = Indirect.File
1382    *       [2] = Indirect.Index
1383    *       .x = Indirect.SwizzleX
1384    */
1385   if (reg->Register.Indirect) {
1386      union tgsi_exec_channel index;
1387      union tgsi_exec_channel indir_index;
1388      uint swizzle;
1389
1390      /* which address register (always zero for now) */
1391      index.i[0] =
1392      index.i[1] =
1393      index.i[2] =
1394      index.i[3] = reg->Indirect.Index;
1395
1396      /* get current value of address register[swizzle] */
1397      swizzle = tgsi_util_get_src_register_swizzle( &reg->Indirect, CHAN_X );
1398
1399      /* fetch values from the address/indirection register */
1400      fetch_src_file_channel(mach,
1401                             chan_index,
1402                             reg->Indirect.File,
1403                             swizzle,
1404                             &index,
1405                             &ZeroVec,
1406                             &indir_index);
1407
1408      /* save indirection offset */
1409      offset = indir_index.i[0];
1410   }
1411
1412   /* There is an extra source register that is a second
1413    * subscript to a register file. Effectively it means that
1414    * the register file is actually a 2D array of registers.
1415    *
1416    *    file[3][1],
1417    *    where:
1418    *       [3] = Dimension.Index
1419    */
1420   if (reg->Register.Dimension) {
1421      index2D.i[0] =
1422      index2D.i[1] =
1423      index2D.i[2] =
1424      index2D.i[3] = reg->Dimension.Index;
1425
1426      /* Again, the second subscript index can be addressed indirectly
1427       * identically to the first one.
1428       * Nothing stops us from indirectly addressing the indirect register,
1429       * but there is no need for that, so we won't exercise it.
1430       *
1431       *    file[ind[4].y+3][1],
1432       *    where:
1433       *       ind = DimIndirect.File
1434       *       [4] = DimIndirect.Index
1435       *       .y = DimIndirect.SwizzleX
1436       */
1437      if (reg->Dimension.Indirect) {
1438         union tgsi_exec_channel index2;
1439         union tgsi_exec_channel indir_index;
1440         const uint execmask = mach->ExecMask;
1441         unsigned swizzle;
1442         uint i;
1443
1444         index2.i[0] =
1445         index2.i[1] =
1446         index2.i[2] =
1447         index2.i[3] = reg->DimIndirect.Index;
1448
1449         swizzle = tgsi_util_get_src_register_swizzle( &reg->DimIndirect, CHAN_X );
1450         fetch_src_file_channel(mach,
1451                                chan_index,
1452                                reg->DimIndirect.File,
1453                                swizzle,
1454                                &index2,
1455                                &ZeroVec,
1456                                &indir_index);
1457
1458         index2D.i[0] += indir_index.i[0];
1459         index2D.i[1] += indir_index.i[1];
1460         index2D.i[2] += indir_index.i[2];
1461         index2D.i[3] += indir_index.i[3];
1462
1463         /* for disabled execution channels, zero-out the index to
1464          * avoid using a potential garbage value.
1465          */
1466         for (i = 0; i < QUAD_SIZE; i++) {
1467            if ((execmask & (1 << i)) == 0) {
1468               index2D.i[i] = 0;
1469            }
1470         }
1471      }
1472
1473      /* If by any chance there was a need for a 3D array of register
1474       * files, we would have to check whether Dimension is followed
1475       * by a dimension register and continue the saga.
1476       */
1477   } else {
1478      index2D.i[0] =
1479      index2D.i[1] =
1480      index2D.i[2] =
1481      index2D.i[3] = 0;
1482   }
1483
1484   switch (reg->Register.File) {
1485   case TGSI_FILE_NULL:
1486      dst = &null;
1487      break;
1488
1489   case TGSI_FILE_OUTPUT:
1490      index = mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0]
1491         + reg->Register.Index;
1492      dst = &mach->Outputs[offset + index].xyzw[chan_index];
1493#if 0
1494      if (TGSI_PROCESSOR_GEOMETRY == mach->Processor) {
1495         fprintf(stderr, "STORING OUT[%d] mask(%d), = (", offset + index, execmask);
1496         for (i = 0; i < QUAD_SIZE; i++)
1497            if (execmask & (1 << i))
1498               fprintf(stderr, "%f, ", chan->f[i]);
1499         fprintf(stderr, ")\n");
1500      }
1501#endif
1502      break;
1503
1504   case TGSI_FILE_TEMPORARY:
1505      index = reg->Register.Index;
1506      assert( index < TGSI_EXEC_NUM_TEMPS );
1507      dst = &mach->Temps[offset + index].xyzw[chan_index];
1508      break;
1509
1510   case TGSI_FILE_TEMPORARY_ARRAY:
1511      index = reg->Register.Index;
1512      assert( index < TGSI_EXEC_NUM_TEMPS );
1513      assert( index2D.i[0] < TGSI_EXEC_NUM_TEMP_ARRAYS );
1514      /* XXX we use index2D.i[0] here but somehow we might
1515       * end up with someone trying to store indirectly in
1516       * different buffers */
1517      dst = &mach->TempArray[index2D.i[0]][offset + index].xyzw[chan_index];
1518      break;
1519
1520   case TGSI_FILE_ADDRESS:
1521      index = reg->Register.Index;
1522      dst = &mach->Addrs[index].xyzw[chan_index];
1523      break;
1524
1525   case TGSI_FILE_PREDICATE:
1526      index = reg->Register.Index;
1527      assert(index < TGSI_EXEC_NUM_PREDS);
1528      dst = &mach->Predicates[index].xyzw[chan_index];
1529      break;
1530
1531   default:
1532      assert( 0 );
1533      return;
1534   }
1535
1536   if (inst->Instruction.Predicate) {
1537      uint swizzle;
1538      union tgsi_exec_channel *pred;
1539
1540      switch (chan_index) {
1541      case CHAN_X:
1542         swizzle = inst->Predicate.SwizzleX;
1543         break;
1544      case CHAN_Y:
1545         swizzle = inst->Predicate.SwizzleY;
1546         break;
1547      case CHAN_Z:
1548         swizzle = inst->Predicate.SwizzleZ;
1549         break;
1550      case CHAN_W:
1551         swizzle = inst->Predicate.SwizzleW;
1552         break;
1553      default:
1554         assert(0);
1555         return;
1556      }
1557
1558      assert(inst->Predicate.Index == 0);
1559
1560      pred = &mach->Predicates[inst->Predicate.Index].xyzw[swizzle];
1561
1562      if (inst->Predicate.Negate) {
1563         for (i = 0; i < QUAD_SIZE; i++) {
1564            if (pred->u[i]) {
1565               execmask &= ~(1 << i);
1566            }
1567         }
1568      } else {
1569         for (i = 0; i < QUAD_SIZE; i++) {
1570            if (!pred->u[i]) {
1571               execmask &= ~(1 << i);
1572            }
1573         }
1574      }
1575   }
1576
1577   switch (inst->Instruction.Saturate) {
1578   case TGSI_SAT_NONE:
1579      for (i = 0; i < QUAD_SIZE; i++)
1580         if (execmask & (1 << i))
1581            dst->i[i] = chan->i[i];
1582      break;
1583
1584   case TGSI_SAT_ZERO_ONE:
1585      for (i = 0; i < QUAD_SIZE; i++)
1586         if (execmask & (1 << i)) {
1587            if (chan->f[i] < 0.0f)
1588               dst->f[i] = 0.0f;
1589            else if (chan->f[i] > 1.0f)
1590               dst->f[i] = 1.0f;
1591            else
1592               dst->i[i] = chan->i[i];
1593         }
1594      break;
1595
1596   case TGSI_SAT_MINUS_PLUS_ONE:
1597      for (i = 0; i < QUAD_SIZE; i++)
1598         if (execmask & (1 << i)) {
1599            if (chan->f[i] < -1.0f)
1600               dst->f[i] = -1.0f;
1601            else if (chan->f[i] > 1.0f)
1602               dst->f[i] = 1.0f;
1603            else
1604               dst->i[i] = chan->i[i];
1605         }
1606      break;
1607
1608   default:
1609      assert( 0 );
1610   }
1611}
1612
/*
 * Shorthands for fetch_source() used by the instruction handlers.
 *
 * Both macros expand to a call that refers to the variables 'mach' and
 * 'inst', which must therefore exist in the enclosing scope.  VAL is the
 * channel that receives the result, INDEX selects the source operand
 * inst->Src[INDEX] and CHAN is the channel to fetch.
 *
 * FETCH treats the operand as float data, IFETCH as integer data; the
 * data type is passed through to fetch_source().
 */
#define FETCH(VAL,INDEX,CHAN)\
    fetch_source(mach, VAL, &inst->Src[INDEX], CHAN, TGSI_EXEC_DATA_FLOAT)

#define IFETCH(VAL,INDEX,CHAN)\
    fetch_source(mach, VAL, &inst->Src[INDEX], CHAN, TGSI_EXEC_DATA_INT)
1618
1619
1620/**
1621 * Execute ARB-style KIL which is predicated by a src register.
1622 * Kill fragment if any of the four values is less than zero.
1623 */
1624static void
1625exec_kil(struct tgsi_exec_machine *mach,
1626         const struct tgsi_full_instruction *inst)
1627{
1628   uint uniquemask;
1629   uint chan_index;
1630   uint kilmask = 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
1631   union tgsi_exec_channel r[1];
1632
1633   /* This mask stores component bits that were already tested. */
1634   uniquemask = 0;
1635
1636   for (chan_index = 0; chan_index < 4; chan_index++)
1637   {
1638      uint swizzle;
1639      uint i;
1640
1641      /* unswizzle channel */
1642      swizzle = tgsi_util_get_full_src_register_swizzle (
1643                        &inst->Src[0],
1644                        chan_index);
1645
1646      /* check if the component has not been already tested */
1647      if (uniquemask & (1 << swizzle))
1648         continue;
1649      uniquemask |= 1 << swizzle;
1650
1651      FETCH(&r[0], 0, chan_index);
1652      for (i = 0; i < 4; i++)
1653         if (r[0].f[i] < 0.0f)
1654            kilmask |= 1 << i;
1655   }
1656
1657   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
1658}
1659
1660/**
1661 * Execute NVIDIA-style KIL which is predicated by a condition code.
1662 * Kill fragment if the condition code is TRUE.
1663 */
1664static void
1665exec_kilp(struct tgsi_exec_machine *mach,
1666          const struct tgsi_full_instruction *inst)
1667{
1668   uint kilmask; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
1669
1670   /* "unconditional" kil */
1671   kilmask = mach->ExecMask;
1672   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
1673}
1674
1675static void
1676emit_vertex(struct tgsi_exec_machine *mach)
1677{
1678   /* FIXME: check for exec mask correctly
1679   unsigned i;
1680   for (i = 0; i < QUAD_SIZE; ++i) {
1681         if ((mach->ExecMask & (1 << i)))
1682   */
1683   if (mach->ExecMask) {
1684      mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] += mach->NumOutputs;
1685      mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]]++;
1686   }
1687}
1688
1689static void
1690emit_primitive(struct tgsi_exec_machine *mach)
1691{
1692   unsigned *prim_count = &mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0];
1693   /* FIXME: check for exec mask correctly
1694   unsigned i;
1695   for (i = 0; i < QUAD_SIZE; ++i) {
1696         if ((mach->ExecMask & (1 << i)))
1697   */
1698   if (mach->ExecMask) {
1699      ++(*prim_count);
1700      debug_assert((*prim_count * mach->NumOutputs) < mach->MaxGeometryShaderOutputs);
1701      mach->Primitives[*prim_count] = 0;
1702   }
1703}
1704
1705static void
1706conditional_emit_primitive(struct tgsi_exec_machine *mach)
1707{
1708   if (TGSI_PROCESSOR_GEOMETRY == mach->Processor) {
1709      int emitted_verts =
1710         mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]];
1711      if (emitted_verts) {
1712         emit_primitive(mach);
1713      }
1714   }
1715}
1716
1717
1718/*
1719 * Fetch four texture samples using STR texture coordinates.
1720 */
1721static void
1722fetch_texel( struct tgsi_sampler *sampler,
1723             const union tgsi_exec_channel *s,
1724             const union tgsi_exec_channel *t,
1725             const union tgsi_exec_channel *p,
1726             const union tgsi_exec_channel *c0,
1727             enum tgsi_sampler_control control,
1728             union tgsi_exec_channel *r,
1729             union tgsi_exec_channel *g,
1730             union tgsi_exec_channel *b,
1731             union tgsi_exec_channel *a )
1732{
1733   uint j;
1734   float rgba[NUM_CHANNELS][QUAD_SIZE];
1735
1736   sampler->get_samples(sampler, s->f, t->f, p->f, c0->f, control, rgba);
1737
1738   for (j = 0; j < 4; j++) {
1739      r->f[j] = rgba[0][j];
1740      g->f[j] = rgba[1][j];
1741      b->f[j] = rgba[2][j];
1742      a->f[j] = rgba[3][j];
1743   }
1744}
1745
1746
/*
 * Values for the 'modifier' argument of exec_tex(); they describe how the
 * W component of the texture coordinate operand is used:
 *
 *   NONE          W is not fetched
 *   PROJECTED     coordinates are divided by W
 *   LOD_BIAS      W is passed to the sampler as the LOD value, with
 *                 tgsi_sampler_lod_bias control
 *   EXPLICIT_LOD  W is passed to the sampler as the LOD value, with
 *                 tgsi_sampler_lod_explicit control
 */
#define TEX_MODIFIER_NONE           0
#define TEX_MODIFIER_PROJECTED      1
#define TEX_MODIFIER_LOD_BIAS       2
#define TEX_MODIFIER_EXPLICIT_LOD   3
1751
1752
/**
 * Execute a texture sampling instruction.
 *
 * Source operand 0 supplies the texture coordinates and the index of
 * source operand 1 selects the sampler unit.  'modifier' is one of the
 * TEX_MODIFIER_x values and decides whether the W coordinate is used as a
 * projective divisor or as the LOD value handed to the sampler.
 *
 * The r[] array is used both for the coordinates passed to fetch_texel()
 * and for the RGBA result it returns.  The result is written to
 * destination 0, honouring the write mask.
 */
static void
exec_tex(struct tgsi_exec_machine *mach,
         const struct tgsi_full_instruction *inst,
         uint modifier)
{
   const uint unit = inst->Src[1].Register.Index;
   union tgsi_exec_channel r[4];
   const union tgsi_exec_channel *lod = &ZeroVec;
   enum tgsi_sampler_control control;
   uint chan;

   /* Every modifier needs W: as divisor when projecting, otherwise as the
    * LOD value passed to the sampler.
    */
   if (modifier != TEX_MODIFIER_NONE) {
      FETCH(&r[3], 0, CHAN_W);
      if (modifier != TEX_MODIFIER_PROJECTED) {
         lod = &r[3];
      }
   }

   /* Without an explicit LOD the sampler is run in bias mode; the bias is
    * zero unless the modifier is TEX_MODIFIER_LOD_BIAS.
    */
   if (modifier == TEX_MODIFIER_EXPLICIT_LOD) {
      control = tgsi_sampler_lod_explicit;
   } else {
      control = tgsi_sampler_lod_bias;
   }

   switch (inst->Texture.Texture) {
   case TGSI_TEXTURE_1D:
      FETCH(&r[0], 0, CHAN_X);

      if (modifier == TEX_MODIFIER_PROJECTED) {
         micro_div(&r[0], &r[0], &r[3]);
      }

      fetch_texel(mach->Samplers[unit],
                  &r[0], &ZeroVec, &ZeroVec, lod,  /* S, T, P, LOD */
                  control,
                  &r[0], &r[1], &r[2], &r[3]);     /* R, G, B, A */
      break;
   case TGSI_TEXTURE_SHADOW1D:
      /* Z is passed in the P slot */
      FETCH(&r[0], 0, CHAN_X);
      FETCH(&r[2], 0, CHAN_Z);

      /* NOTE(review): only S is projected here, whereas the 2D shadow
       * case below also divides Z by W -- confirm this is intended.
       */
      if (modifier == TEX_MODIFIER_PROJECTED) {
         micro_div(&r[0], &r[0], &r[3]);
      }

      fetch_texel(mach->Samplers[unit],
                  &r[0], &ZeroVec, &r[2], lod,  /* S, T, P, LOD */
                  control,
                  &r[0], &r[1], &r[2], &r[3]);     /* R, G, B, A */
      break;

   case TGSI_TEXTURE_2D:
   case TGSI_TEXTURE_RECT:
   case TGSI_TEXTURE_SHADOW2D:
   case TGSI_TEXTURE_SHADOWRECT:
      /* Z is always fetched and passed in the P slot, also for the
       * non-shadow targets.
       */
      FETCH(&r[0], 0, CHAN_X);
      FETCH(&r[1], 0, CHAN_Y);
      FETCH(&r[2], 0, CHAN_Z);

      if (modifier == TEX_MODIFIER_PROJECTED) {
         micro_div(&r[0], &r[0], &r[3]);
         micro_div(&r[1], &r[1], &r[3]);
         micro_div(&r[2], &r[2], &r[3]);
      }

      fetch_texel(mach->Samplers[unit],
                  &r[0], &r[1], &r[2], lod,     /* S, T, P, LOD */
                  control,
                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
      break;

   case TGSI_TEXTURE_1D_ARRAY:
      FETCH(&r[0], 0, CHAN_X);
      FETCH(&r[1], 0, CHAN_Y);

      /* only S is projected; Y is passed through unchanged */
      if (modifier == TEX_MODIFIER_PROJECTED) {
         micro_div(&r[0], &r[0], &r[3]);
      }

      fetch_texel(mach->Samplers[unit],
                  &r[0], &r[1], &ZeroVec, lod,     /* S, T, P, LOD */
                  control,
                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
      break;
   case TGSI_TEXTURE_SHADOW1D_ARRAY:
      FETCH(&r[0], 0, CHAN_X);
      FETCH(&r[1], 0, CHAN_Y);
      FETCH(&r[2], 0, CHAN_Z);

      /* only S is projected; Y and Z are passed through unchanged */
      if (modifier == TEX_MODIFIER_PROJECTED) {
         micro_div(&r[0], &r[0], &r[3]);
      }

      fetch_texel(mach->Samplers[unit],
                  &r[0], &r[1], &r[2], lod,     /* S, T, P, LOD */
                  control,
                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
      break;

   case TGSI_TEXTURE_2D_ARRAY:
      FETCH(&r[0], 0, CHAN_X);
      FETCH(&r[1], 0, CHAN_Y);
      FETCH(&r[2], 0, CHAN_Z);

      /* S and T are projected; Z is passed through unchanged */
      if (modifier == TEX_MODIFIER_PROJECTED) {
         micro_div(&r[0], &r[0], &r[3]);
         micro_div(&r[1], &r[1], &r[3]);
      }

      fetch_texel(mach->Samplers[unit],
                  &r[0], &r[1], &r[2], lod,     /* S, T, P, LOD */
                  control,
                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
      break;
   case TGSI_TEXTURE_SHADOW2D_ARRAY:
   case TGSI_TEXTURE_SHADOWCUBE:
      /* All four coordinates are consumed: W is (re)fetched and handed to
       * the sampler in the slot that otherwise carries the LOD, so 'lod'
       * and projection are not applied for these targets.
       */
      FETCH(&r[0], 0, CHAN_X);
      FETCH(&r[1], 0, CHAN_Y);
      FETCH(&r[2], 0, CHAN_Z);
      FETCH(&r[3], 0, CHAN_W);

      fetch_texel(mach->Samplers[unit],
                  &r[0], &r[1], &r[2], &r[3],     /* S, T, P, LOD */
                  control,
                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
      break;
   case TGSI_TEXTURE_3D:
   case TGSI_TEXTURE_CUBE:
      FETCH(&r[0], 0, CHAN_X);
      FETCH(&r[1], 0, CHAN_Y);
      FETCH(&r[2], 0, CHAN_Z);

      if (modifier == TEX_MODIFIER_PROJECTED) {
         micro_div(&r[0], &r[0], &r[3]);
         micro_div(&r[1], &r[1], &r[3]);
         micro_div(&r[2], &r[2], &r[3]);
      }

      fetch_texel(mach->Samplers[unit],
                  &r[0], &r[1], &r[2], lod,
                  control,
                  &r[0], &r[1], &r[2], &r[3]);
      break;

   default:
      /* NOTE(review): when assertions are compiled out, execution falls
       * through to the stores below with r[] not (fully) written.
       */
      assert(0);
   }

#if 0
   debug_printf("fetch r: %g %g %g %g\n",
         r[0].f[0], r[0].f[1], r[0].f[2], r[0].f[3]);
   debug_printf("fetch g: %g %g %g %g\n",
         r[1].f[0], r[1].f[1], r[1].f[2], r[1].f[3]);
   debug_printf("fetch b: %g %g %g %g\n",
         r[2].f[0], r[2].f[1], r[2].f[2], r[2].f[3]);
   debug_printf("fetch a: %g %g %g %g\n",
         r[3].f[0], r[3].f[1], r[3].f[2], r[3].f[3]);
#endif

   /* write back the channels selected by the destination write mask */
   for (chan = 0; chan < NUM_CHANNELS; chan++) {
      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
      }
   }
}
1918
/**
 * Execute TXD (texture sampling with explicit derivatives).
 *
 * Source operand 0 supplies the texture coordinates and the index of
 * source operand 3 selects the sampler unit.  The derivative operands are
 * not read: the texture is sampled in bias mode with a zero bias (see the
 * XXX note below).
 *
 * The r[] array is used both for the coordinates passed to fetch_texel()
 * and for the RGBA result it returns.  The result is written to
 * destination 0, honouring the write mask.
 */
static void
exec_txd(struct tgsi_exec_machine *mach,
         const struct tgsi_full_instruction *inst)
{
   const uint unit = inst->Src[3].Register.Index;
   union tgsi_exec_channel r[4];
   uint chan;

   /*
    * XXX: This is fake TXD -- the derivatives are not taken into account, yet.
    */

   switch (inst->Texture.Texture) {
   case TGSI_TEXTURE_1D:
   case TGSI_TEXTURE_SHADOW1D:

      /* Only S is used; for SHADOW1D no value is passed in the P slot. */
      FETCH(&r[0], 0, CHAN_X);

      fetch_texel(mach->Samplers[unit],
                  &r[0], &ZeroVec, &ZeroVec, &ZeroVec,   /* S, T, P, BIAS */
                  tgsi_sampler_lod_bias,
                  &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
      break;

   case TGSI_TEXTURE_1D_ARRAY:
   case TGSI_TEXTURE_2D:
   case TGSI_TEXTURE_RECT:
   case TGSI_TEXTURE_SHADOW1D_ARRAY:
   case TGSI_TEXTURE_SHADOW2D:
   case TGSI_TEXTURE_SHADOWRECT:

      /* X, Y and Z are passed as S, T and P */
      FETCH(&r[0], 0, CHAN_X);
      FETCH(&r[1], 0, CHAN_Y);
      FETCH(&r[2], 0, CHAN_Z);

      fetch_texel(mach->Samplers[unit],
                  &r[0], &r[1], &r[2], &ZeroVec,   /* inputs */
                  tgsi_sampler_lod_bias,
                  &r[0], &r[1], &r[2], &r[3]);     /* outputs */
      break;

   case TGSI_TEXTURE_2D_ARRAY:
   case TGSI_TEXTURE_3D:
   case TGSI_TEXTURE_CUBE:

      /* same handling as the group above */
      FETCH(&r[0], 0, CHAN_X);
      FETCH(&r[1], 0, CHAN_Y);
      FETCH(&r[2], 0, CHAN_Z);

      fetch_texel(mach->Samplers[unit],
                  &r[0], &r[1], &r[2], &ZeroVec,
                  tgsi_sampler_lod_bias,
                  &r[0], &r[1], &r[2], &r[3]);
      break;

   case TGSI_TEXTURE_SHADOW2D_ARRAY:

      /* All four coordinates are consumed: W is handed to the sampler in
       * the slot that otherwise carries the bias.
       */
      FETCH(&r[0], 0, CHAN_X);
      FETCH(&r[1], 0, CHAN_Y);
      FETCH(&r[2], 0, CHAN_Z);
      FETCH(&r[3], 0, CHAN_W);

      fetch_texel(mach->Samplers[unit],
                  &r[0], &r[1], &r[2], &r[3],
                  tgsi_sampler_lod_bias,
                  &r[0], &r[1], &r[2], &r[3]);
      break;

   default:
      /* NOTE(review): when assertions are compiled out, execution falls
       * through to the stores below with r[] not written.
       */
      assert(0);
   }

   /* write back the channels selected by the destination write mask */
   for (chan = 0; chan < NUM_CHANNELS; chan++) {
      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
      }
   }
}
1997
1998
1999static void
2000exec_txf(struct tgsi_exec_machine *mach,
2001	 const struct tgsi_full_instruction *inst)
2002{
2003   struct tgsi_sampler *sampler;
2004   const uint unit = inst->Src[2].Register.Index;
2005   union tgsi_exec_channel r[4];
2006   union tgsi_exec_channel offset[3];
2007   uint chan;
2008   float rgba[NUM_CHANNELS][QUAD_SIZE];
2009   int j;
2010   int8_t offsets[3];
2011
2012   if (inst->Texture.NumOffsets == 1) {
2013      union tgsi_exec_channel index;
2014      index.i[0] = index.i[1] = index.i[2] = index.i[3] = inst->TexOffsets[0].Index;
2015      fetch_src_file_channel(mach, 0, inst->TexOffsets[0].File,
2016                             inst->TexOffsets[0].SwizzleX, &index, &ZeroVec, &offset[0]);
2017      fetch_src_file_channel(mach, 0, inst->TexOffsets[0].File,
2018                             inst->TexOffsets[0].SwizzleY, &index, &ZeroVec, &offset[1]);
2019      fetch_src_file_channel(mach, 0, inst->TexOffsets[0].File,
2020                             inst->TexOffsets[0].SwizzleZ, &index, &ZeroVec, &offset[2]);
2021     offsets[0] = offset[0].i[0];
2022     offsets[1] = offset[1].i[0];
2023     offsets[2] = offset[2].i[0];
2024   } else
2025     offsets[0] = offsets[1] = offsets[2] = 0;
2026
2027   IFETCH(&r[3], 0, CHAN_W);
2028
2029   switch(inst->Texture.Texture) {
2030   case TGSI_TEXTURE_3D:
2031   case TGSI_TEXTURE_2D_ARRAY:
2032   case TGSI_TEXTURE_SHADOW2D_ARRAY:
2033      IFETCH(&r[2], 0, CHAN_Z);
2034      /* fallthrough */
2035   case TGSI_TEXTURE_2D:
2036   case TGSI_TEXTURE_RECT:
2037   case TGSI_TEXTURE_SHADOW1D_ARRAY:
2038   case TGSI_TEXTURE_SHADOW2D:
2039   case TGSI_TEXTURE_SHADOWRECT:
2040   case TGSI_TEXTURE_1D_ARRAY:
2041      IFETCH(&r[1], 0, CHAN_Y);
2042      /* fallthrough */
2043   case TGSI_TEXTURE_1D:
2044   case TGSI_TEXTURE_SHADOW1D:
2045      IFETCH(&r[0], 0, CHAN_X);
2046      break;
2047   default:
2048      assert(0);
2049      break;
2050   }
2051
2052   sampler = mach->Samplers[unit];
2053   sampler->get_texel(sampler, r[0].i, r[1].i, r[2].i, r[3].i,
2054		      offsets, rgba);
2055
2056   for (j = 0; j < QUAD_SIZE; j++) {
2057      r[0].f[j] = rgba[0][j];
2058      r[1].f[j] = rgba[1][j];
2059      r[2].f[j] = rgba[2][j];
2060      r[3].f[j] = rgba[3][j];
2061   }
2062
2063   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2064      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2065         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2066      }
2067   }
2068}
2069
2070static void
2071exec_txq(struct tgsi_exec_machine *mach,
2072         const struct tgsi_full_instruction *inst)
2073{
2074   struct tgsi_sampler *sampler;
2075   const uint unit = inst->Src[1].Register.Index;
2076   int result[4];
2077   union tgsi_exec_channel r[4], src;
2078   uint chan;
2079   int i,j;
2080
2081   fetch_source(mach, &src, &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_INT);
2082   sampler = mach->Samplers[unit];
2083
2084   sampler->get_dims(sampler, src.i[0], result);
2085
2086   for (i = 0; i < QUAD_SIZE; i++) {
2087      for (j = 0; j < 4; j++) {
2088	 r[j].i[i] = result[j];
2089      }
2090   }
2091
2092   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2093      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2094	 store_dest(mach, &r[chan], &inst->Dst[0], inst, chan,
2095		    TGSI_EXEC_DATA_INT);
2096      }
2097   }
2098}
2099
2100static void
2101exec_sample(struct tgsi_exec_machine *mach,
2102            const struct tgsi_full_instruction *inst,
2103            uint modifier)
2104{
2105   const uint resource_unit = inst->Src[1].Register.Index;
2106   const uint sampler_unit = inst->Src[2].Register.Index;
2107   union tgsi_exec_channel r[4];
2108   const union tgsi_exec_channel *lod = &ZeroVec;
2109   enum tgsi_sampler_control control;
2110   uint chan;
2111
2112   if (modifier != TEX_MODIFIER_NONE) {
2113      if (modifier == TEX_MODIFIER_LOD_BIAS)
2114         FETCH(&r[3], 3, CHAN_X);
2115      else /*TEX_MODIFIER_LOD*/
2116         FETCH(&r[3], 0, CHAN_W);
2117
2118      if (modifier != TEX_MODIFIER_PROJECTED) {
2119         lod = &r[3];
2120      }
2121   }
2122
2123   if (modifier == TEX_MODIFIER_EXPLICIT_LOD) {
2124      control = tgsi_sampler_lod_explicit;
2125   } else {
2126      control = tgsi_sampler_lod_bias;
2127   }
2128
2129   switch (mach->Resources[resource_unit].Resource) {
2130   case TGSI_TEXTURE_1D:
2131   case TGSI_TEXTURE_SHADOW1D:
2132      FETCH(&r[0], 0, CHAN_X);
2133
2134      if (modifier == TEX_MODIFIER_PROJECTED) {
2135         micro_div(&r[0], &r[0], &r[3]);
2136      }
2137
2138      fetch_texel(mach->Samplers[sampler_unit],
2139                  &r[0], &ZeroVec, &ZeroVec, lod,  /* S, T, P, LOD */
2140                  control,
2141                  &r[0], &r[1], &r[2], &r[3]);     /* R, G, B, A */
2142      break;
2143
2144   case TGSI_TEXTURE_1D_ARRAY:
2145   case TGSI_TEXTURE_2D:
2146   case TGSI_TEXTURE_RECT:
2147   case TGSI_TEXTURE_SHADOW1D_ARRAY:
2148   case TGSI_TEXTURE_SHADOW2D:
2149   case TGSI_TEXTURE_SHADOWRECT:
2150      FETCH(&r[0], 0, CHAN_X);
2151      FETCH(&r[1], 0, CHAN_Y);
2152      FETCH(&r[2], 0, CHAN_Z);
2153
2154      if (modifier == TEX_MODIFIER_PROJECTED) {
2155         micro_div(&r[0], &r[0], &r[3]);
2156         micro_div(&r[1], &r[1], &r[3]);
2157         micro_div(&r[2], &r[2], &r[3]);
2158      }
2159
2160      fetch_texel(mach->Samplers[sampler_unit],
2161                  &r[0], &r[1], &r[2], lod,     /* S, T, P, LOD */
2162                  control,
2163                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
2164      break;
2165
2166   case TGSI_TEXTURE_2D_ARRAY:
2167   case TGSI_TEXTURE_3D:
2168   case TGSI_TEXTURE_CUBE:
2169      FETCH(&r[0], 0, CHAN_X);
2170      FETCH(&r[1], 0, CHAN_Y);
2171      FETCH(&r[2], 0, CHAN_Z);
2172
2173      if (modifier == TEX_MODIFIER_PROJECTED) {
2174         micro_div(&r[0], &r[0], &r[3]);
2175         micro_div(&r[1], &r[1], &r[3]);
2176         micro_div(&r[2], &r[2], &r[3]);
2177      }
2178
2179      fetch_texel(mach->Samplers[sampler_unit],
2180                  &r[0], &r[1], &r[2], lod,
2181                  control,
2182                  &r[0], &r[1], &r[2], &r[3]);
2183      break;
2184
2185   case TGSI_TEXTURE_SHADOW2D_ARRAY:
2186   case TGSI_TEXTURE_SHADOWCUBE:
2187      FETCH(&r[0], 0, CHAN_X);
2188      FETCH(&r[1], 0, CHAN_Y);
2189      FETCH(&r[2], 0, CHAN_Z);
2190      FETCH(&r[3], 0, CHAN_W);
2191
2192      assert(modifier != TEX_MODIFIER_PROJECTED);
2193
2194      fetch_texel(mach->Samplers[sampler_unit],
2195                  &r[0], &r[1], &r[2], &r[3],
2196                  control,
2197                  &r[0], &r[1], &r[2], &r[3]);
2198      break;
2199
2200   default:
2201      assert(0);
2202   }
2203
2204   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2205      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2206         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2207      }
2208   }
2209}
2210
2211static void
2212exec_sample_d(struct tgsi_exec_machine *mach,
2213              const struct tgsi_full_instruction *inst)
2214{
2215   const uint resource_unit = inst->Src[1].Register.Index;
2216   const uint sampler_unit = inst->Src[2].Register.Index;
2217   union tgsi_exec_channel r[4];
2218   uint chan;
2219   /*
2220    * XXX: This is fake SAMPLE_D -- the derivatives are not taken into account, yet.
2221    */
2222
2223   switch (mach->Resources[resource_unit].Resource) {
2224   case TGSI_TEXTURE_1D:
2225   case TGSI_TEXTURE_SHADOW1D:
2226
2227      FETCH(&r[0], 0, CHAN_X);
2228
2229      fetch_texel(mach->Samplers[sampler_unit],
2230                  &r[0], &ZeroVec, &ZeroVec, &ZeroVec,   /* S, T, P, BIAS */
2231                  tgsi_sampler_lod_bias,
2232                  &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
2233      break;
2234
2235   case TGSI_TEXTURE_2D:
2236   case TGSI_TEXTURE_RECT:
2237   case TGSI_TEXTURE_SHADOW2D:
2238   case TGSI_TEXTURE_SHADOWRECT:
2239
2240      FETCH(&r[0], 0, CHAN_X);
2241      FETCH(&r[1], 0, CHAN_Y);
2242      FETCH(&r[2], 0, CHAN_Z);
2243
2244      fetch_texel(mach->Samplers[sampler_unit],
2245                  &r[0], &r[1], &r[2], &ZeroVec,   /* inputs */
2246                  tgsi_sampler_lod_bias,
2247                  &r[0], &r[1], &r[2], &r[3]);     /* outputs */
2248      break;
2249
2250   case TGSI_TEXTURE_3D:
2251   case TGSI_TEXTURE_CUBE:
2252
2253      FETCH(&r[0], 0, CHAN_X);
2254      FETCH(&r[1], 0, CHAN_Y);
2255      FETCH(&r[2], 0, CHAN_Z);
2256
2257      fetch_texel(mach->Samplers[sampler_unit],
2258                  &r[0], &r[1], &r[2], &ZeroVec,
2259                  tgsi_sampler_lod_bias,
2260                  &r[0], &r[1], &r[2], &r[3]);
2261      break;
2262
2263   default:
2264      assert(0);
2265   }
2266
2267   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2268      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2269         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2270      }
2271   }
2272}
2273
2274
2275/**
2276 * Evaluate a constant-valued coefficient at the position of the
2277 * current quad.
2278 */
2279static void
2280eval_constant_coef(
2281   struct tgsi_exec_machine *mach,
2282   unsigned attrib,
2283   unsigned chan )
2284{
2285   unsigned i;
2286
2287   for( i = 0; i < QUAD_SIZE; i++ ) {
2288      mach->Inputs[attrib].xyzw[chan].f[i] = mach->InterpCoefs[attrib].a0[chan];
2289   }
2290}
2291
2292/**
2293 * Evaluate a linear-valued coefficient at the position of the
2294 * current quad.
2295 */
2296static void
2297eval_linear_coef(
2298   struct tgsi_exec_machine *mach,
2299   unsigned attrib,
2300   unsigned chan )
2301{
2302   const float x = mach->QuadPos.xyzw[0].f[0];
2303   const float y = mach->QuadPos.xyzw[1].f[0];
2304   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
2305   const float dady = mach->InterpCoefs[attrib].dady[chan];
2306   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
2307   mach->Inputs[attrib].xyzw[chan].f[0] = a0;
2308   mach->Inputs[attrib].xyzw[chan].f[1] = a0 + dadx;
2309   mach->Inputs[attrib].xyzw[chan].f[2] = a0 + dady;
2310   mach->Inputs[attrib].xyzw[chan].f[3] = a0 + dadx + dady;
2311}
2312
2313/**
2314 * Evaluate a perspective-valued coefficient at the position of the
2315 * current quad.
2316 */
2317static void
2318eval_perspective_coef(
2319   struct tgsi_exec_machine *mach,
2320   unsigned attrib,
2321   unsigned chan )
2322{
2323   const float x = mach->QuadPos.xyzw[0].f[0];
2324   const float y = mach->QuadPos.xyzw[1].f[0];
2325   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
2326   const float dady = mach->InterpCoefs[attrib].dady[chan];
2327   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
2328   const float *w = mach->QuadPos.xyzw[3].f;
2329   /* divide by W here */
2330   mach->Inputs[attrib].xyzw[chan].f[0] = a0 / w[0];
2331   mach->Inputs[attrib].xyzw[chan].f[1] = (a0 + dadx) / w[1];
2332   mach->Inputs[attrib].xyzw[chan].f[2] = (a0 + dady) / w[2];
2333   mach->Inputs[attrib].xyzw[chan].f[3] = (a0 + dadx + dady) / w[3];
2334}
2335
2336
2337typedef void (* eval_coef_func)(
2338   struct tgsi_exec_machine *mach,
2339   unsigned attrib,
2340   unsigned chan );
2341
2342static void
2343exec_declaration(struct tgsi_exec_machine *mach,
2344                 const struct tgsi_full_declaration *decl)
2345{
2346   if (decl->Declaration.File == TGSI_FILE_RESOURCE) {
2347      mach->Resources[decl->Range.First] = decl->Resource;
2348      return;
2349   }
2350
2351   if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) {
2352      if (decl->Declaration.File == TGSI_FILE_INPUT) {
2353         uint first, last, mask;
2354
2355         first = decl->Range.First;
2356         last = decl->Range.Last;
2357         mask = decl->Declaration.UsageMask;
2358
2359         /* XXX we could remove this special-case code since
2360          * mach->InterpCoefs[first].a0 should already have the
2361          * front/back-face value.  But we should first update the
2362          * ureg code to emit the right UsageMask value (WRITEMASK_X).
2363          * Then, we could remove the tgsi_exec_machine::Face field.
2364          */
2365         /* XXX make FACE a system value */
2366         if (decl->Semantic.Name == TGSI_SEMANTIC_FACE) {
2367            uint i;
2368
2369            assert(decl->Semantic.Index == 0);
2370            assert(first == last);
2371
2372            for (i = 0; i < QUAD_SIZE; i++) {
2373               mach->Inputs[first].xyzw[0].f[i] = mach->Face;
2374            }
2375         } else {
2376            eval_coef_func eval;
2377            uint i, j;
2378
2379            switch (decl->Declaration.Interpolate) {
2380            case TGSI_INTERPOLATE_CONSTANT:
2381               eval = eval_constant_coef;
2382               break;
2383
2384            case TGSI_INTERPOLATE_LINEAR:
2385               eval = eval_linear_coef;
2386               break;
2387
2388            case TGSI_INTERPOLATE_PERSPECTIVE:
2389               eval = eval_perspective_coef;
2390               break;
2391
2392            case TGSI_INTERPOLATE_COLOR:
2393               eval = mach->flatshade_color ? eval_constant_coef : eval_perspective_coef;
2394               break;
2395
2396            default:
2397               assert(0);
2398               return;
2399            }
2400
2401            for (j = 0; j < NUM_CHANNELS; j++) {
2402               if (mask & (1 << j)) {
2403                  for (i = first; i <= last; i++) {
2404                     eval(mach, i, j);
2405                  }
2406               }
2407            }
2408         }
2409      }
2410   }
2411
2412   if (decl->Declaration.File == TGSI_FILE_SYSTEM_VALUE) {
2413      mach->SysSemanticToIndex[decl->Declaration.Semantic] = decl->Range.First;
2414   }
2415}
2416
2417
2418typedef void (* micro_op)(union tgsi_exec_channel *dst);
2419
2420static void
2421exec_vector(struct tgsi_exec_machine *mach,
2422            const struct tgsi_full_instruction *inst,
2423            micro_op op,
2424            enum tgsi_exec_datatype dst_datatype)
2425{
2426   unsigned int chan;
2427
2428   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2429      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2430         union tgsi_exec_channel dst;
2431
2432         op(&dst);
2433         store_dest(mach, &dst, &inst->Dst[0], inst, chan, dst_datatype);
2434      }
2435   }
2436}
2437
2438typedef void (* micro_unary_op)(union tgsi_exec_channel *dst,
2439                                const union tgsi_exec_channel *src);
2440
2441static void
2442exec_scalar_unary(struct tgsi_exec_machine *mach,
2443                  const struct tgsi_full_instruction *inst,
2444                  micro_unary_op op,
2445                  enum tgsi_exec_datatype dst_datatype,
2446                  enum tgsi_exec_datatype src_datatype)
2447{
2448   unsigned int chan;
2449   union tgsi_exec_channel src;
2450   union tgsi_exec_channel dst;
2451
2452   fetch_source(mach, &src, &inst->Src[0], CHAN_X, src_datatype);
2453   op(&dst, &src);
2454   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2455      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2456         store_dest(mach, &dst, &inst->Dst[0], inst, chan, dst_datatype);
2457      }
2458   }
2459}
2460
2461static void
2462exec_vector_unary(struct tgsi_exec_machine *mach,
2463                  const struct tgsi_full_instruction *inst,
2464                  micro_unary_op op,
2465                  enum tgsi_exec_datatype dst_datatype,
2466                  enum tgsi_exec_datatype src_datatype)
2467{
2468   unsigned int chan;
2469   struct tgsi_exec_vector dst;
2470
2471   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2472      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2473         union tgsi_exec_channel src;
2474
2475         fetch_source(mach, &src, &inst->Src[0], chan, src_datatype);
2476         op(&dst.xyzw[chan], &src);
2477      }
2478   }
2479   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2480      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2481         store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
2482      }
2483   }
2484}
2485
2486typedef void (* micro_binary_op)(union tgsi_exec_channel *dst,
2487                                 const union tgsi_exec_channel *src0,
2488                                 const union tgsi_exec_channel *src1);
2489
2490static void
2491exec_scalar_binary(struct tgsi_exec_machine *mach,
2492                   const struct tgsi_full_instruction *inst,
2493                   micro_binary_op op,
2494                   enum tgsi_exec_datatype dst_datatype,
2495                   enum tgsi_exec_datatype src_datatype)
2496{
2497   unsigned int chan;
2498   union tgsi_exec_channel src[2];
2499   union tgsi_exec_channel dst;
2500
2501   fetch_source(mach, &src[0], &inst->Src[0], CHAN_X, src_datatype);
2502   fetch_source(mach, &src[1], &inst->Src[1], CHAN_Y, src_datatype);
2503   op(&dst, &src[0], &src[1]);
2504   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2505      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2506         store_dest(mach, &dst, &inst->Dst[0], inst, chan, dst_datatype);
2507      }
2508   }
2509}
2510
2511static void
2512exec_vector_binary(struct tgsi_exec_machine *mach,
2513                   const struct tgsi_full_instruction *inst,
2514                   micro_binary_op op,
2515                   enum tgsi_exec_datatype dst_datatype,
2516                   enum tgsi_exec_datatype src_datatype)
2517{
2518   unsigned int chan;
2519   struct tgsi_exec_vector dst;
2520
2521   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2522      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2523         union tgsi_exec_channel src[2];
2524
2525         fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
2526         fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
2527         op(&dst.xyzw[chan], &src[0], &src[1]);
2528      }
2529   }
2530   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2531      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2532         store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
2533      }
2534   }
2535}
2536
2537typedef void (* micro_trinary_op)(union tgsi_exec_channel *dst,
2538                                  const union tgsi_exec_channel *src0,
2539                                  const union tgsi_exec_channel *src1,
2540                                  const union tgsi_exec_channel *src2);
2541
2542static void
2543exec_vector_trinary(struct tgsi_exec_machine *mach,
2544                    const struct tgsi_full_instruction *inst,
2545                    micro_trinary_op op,
2546                    enum tgsi_exec_datatype dst_datatype,
2547                    enum tgsi_exec_datatype src_datatype)
2548{
2549   unsigned int chan;
2550   struct tgsi_exec_vector dst;
2551
2552   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2553      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2554         union tgsi_exec_channel src[3];
2555
2556         fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
2557         fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
2558         fetch_source(mach, &src[2], &inst->Src[2], chan, src_datatype);
2559         op(&dst.xyzw[chan], &src[0], &src[1], &src[2]);
2560      }
2561   }
2562   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2563      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2564         store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
2565      }
2566   }
2567}
2568
2569static void
2570exec_dp3(struct tgsi_exec_machine *mach,
2571         const struct tgsi_full_instruction *inst)
2572{
2573   unsigned int chan;
2574   union tgsi_exec_channel arg[3];
2575
2576   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2577   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2578   micro_mul(&arg[2], &arg[0], &arg[1]);
2579
2580   for (chan = CHAN_Y; chan <= CHAN_Z; chan++) {
2581      fetch_source(mach, &arg[0], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
2582      fetch_source(mach, &arg[1], &inst->Src[1], chan, TGSI_EXEC_DATA_FLOAT);
2583      micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
2584   }
2585
2586   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2587      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2588         store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2589      }
2590   }
2591}
2592
2593static void
2594exec_dp4(struct tgsi_exec_machine *mach,
2595         const struct tgsi_full_instruction *inst)
2596{
2597   unsigned int chan;
2598   union tgsi_exec_channel arg[3];
2599
2600   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2601   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2602   micro_mul(&arg[2], &arg[0], &arg[1]);
2603
2604   for (chan = CHAN_Y; chan <= CHAN_W; chan++) {
2605      fetch_source(mach, &arg[0], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
2606      fetch_source(mach, &arg[1], &inst->Src[1], chan, TGSI_EXEC_DATA_FLOAT);
2607      micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
2608   }
2609
2610   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2611      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2612         store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2613      }
2614   }
2615}
2616
2617static void
2618exec_dp2a(struct tgsi_exec_machine *mach,
2619          const struct tgsi_full_instruction *inst)
2620{
2621   unsigned int chan;
2622   union tgsi_exec_channel arg[3];
2623
2624   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2625   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2626   micro_mul(&arg[2], &arg[0], &arg[1]);
2627
2628   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2629   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2630   micro_mad(&arg[0], &arg[0], &arg[1], &arg[2]);
2631
2632   fetch_source(mach, &arg[1], &inst->Src[2], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2633   micro_add(&arg[0], &arg[0], &arg[1]);
2634
2635   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2636      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2637         store_dest(mach, &arg[0], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2638      }
2639   }
2640}
2641
2642static void
2643exec_dph(struct tgsi_exec_machine *mach,
2644         const struct tgsi_full_instruction *inst)
2645{
2646   unsigned int chan;
2647   union tgsi_exec_channel arg[3];
2648
2649   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2650   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2651   micro_mul(&arg[2], &arg[0], &arg[1]);
2652
2653   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2654   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2655   micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
2656
2657   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2658   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2659   micro_mad(&arg[0], &arg[0], &arg[1], &arg[2]);
2660
2661   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_W, TGSI_EXEC_DATA_FLOAT);
2662   micro_add(&arg[0], &arg[0], &arg[1]);
2663
2664   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2665      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2666         store_dest(mach, &arg[0], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2667      }
2668   }
2669}
2670
2671static void
2672exec_dp2(struct tgsi_exec_machine *mach,
2673         const struct tgsi_full_instruction *inst)
2674{
2675   unsigned int chan;
2676   union tgsi_exec_channel arg[3];
2677
2678   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2679   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2680   micro_mul(&arg[2], &arg[0], &arg[1]);
2681
2682   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2683   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2684   micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
2685
2686   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2687      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2688         store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2689      }
2690   }
2691}
2692
2693static void
2694exec_nrm4(struct tgsi_exec_machine *mach,
2695          const struct tgsi_full_instruction *inst)
2696{
2697   unsigned int chan;
2698   union tgsi_exec_channel arg[4];
2699   union tgsi_exec_channel scale;
2700
2701   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2702   micro_mul(&scale, &arg[0], &arg[0]);
2703
2704   for (chan = CHAN_Y; chan <= CHAN_W; chan++) {
2705      union tgsi_exec_channel product;
2706
2707      fetch_source(mach, &arg[chan], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
2708      micro_mul(&product, &arg[chan], &arg[chan]);
2709      micro_add(&scale, &scale, &product);
2710   }
2711
2712   micro_rsq(&scale, &scale);
2713
2714   for (chan = CHAN_X; chan <= CHAN_W; chan++) {
2715      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2716         micro_mul(&arg[chan], &arg[chan], &scale);
2717         store_dest(mach, &arg[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2718      }
2719   }
2720}
2721
2722static void
2723exec_nrm3(struct tgsi_exec_machine *mach,
2724          const struct tgsi_full_instruction *inst)
2725{
2726   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XYZ) {
2727      unsigned int chan;
2728      union tgsi_exec_channel arg[3];
2729      union tgsi_exec_channel scale;
2730
2731      fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2732      micro_mul(&scale, &arg[0], &arg[0]);
2733
2734      for (chan = CHAN_Y; chan <= CHAN_Z; chan++) {
2735         union tgsi_exec_channel product;
2736
2737         fetch_source(mach, &arg[chan], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
2738         micro_mul(&product, &arg[chan], &arg[chan]);
2739         micro_add(&scale, &scale, &product);
2740      }
2741
2742      micro_rsq(&scale, &scale);
2743
2744      for (chan = CHAN_X; chan <= CHAN_Z; chan++) {
2745         if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2746            micro_mul(&arg[chan], &arg[chan], &scale);
2747            store_dest(mach, &arg[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2748         }
2749      }
2750   }
2751
2752   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2753      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2754   }
2755}
2756
2757static void
2758exec_scs(struct tgsi_exec_machine *mach,
2759         const struct tgsi_full_instruction *inst)
2760{
2761   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY) {
2762      union tgsi_exec_channel arg;
2763      union tgsi_exec_channel result;
2764
2765      fetch_source(mach, &arg, &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2766
2767      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2768         micro_cos(&result, &arg);
2769         store_dest(mach, &result, &inst->Dst[0], inst, CHAN_X, TGSI_EXEC_DATA_FLOAT);
2770      }
2771      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2772         micro_sin(&result, &arg);
2773         store_dest(mach, &result, &inst->Dst[0], inst, CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2774      }
2775   }
2776   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2777      store_dest(mach, &ZeroVec, &inst->Dst[0], inst, CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2778   }
2779   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2780      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2781   }
2782}
2783
2784static void
2785exec_x2d(struct tgsi_exec_machine *mach,
2786         const struct tgsi_full_instruction *inst)
2787{
2788   union tgsi_exec_channel r[4];
2789   union tgsi_exec_channel d[2];
2790
2791   fetch_source(mach, &r[0], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2792   fetch_source(mach, &r[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2793   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XZ) {
2794      fetch_source(mach, &r[2], &inst->Src[2], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2795      micro_mul(&r[2], &r[2], &r[0]);
2796      fetch_source(mach, &r[3], &inst->Src[2], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2797      micro_mul(&r[3], &r[3], &r[1]);
2798      micro_add(&r[2], &r[2], &r[3]);
2799      fetch_source(mach, &r[3], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2800      micro_add(&d[0], &r[2], &r[3]);
2801   }
2802   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_YW) {
2803      fetch_source(mach, &r[2], &inst->Src[2], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2804      micro_mul(&r[2], &r[2], &r[0]);
2805      fetch_source(mach, &r[3], &inst->Src[2], CHAN_W, TGSI_EXEC_DATA_FLOAT);
2806      micro_mul(&r[3], &r[3], &r[1]);
2807      micro_add(&r[2], &r[2], &r[3]);
2808      fetch_source(mach, &r[3], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2809      micro_add(&d[1], &r[2], &r[3]);
2810   }
2811   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2812      store_dest(mach, &d[0], &inst->Dst[0], inst, CHAN_X, TGSI_EXEC_DATA_FLOAT);
2813   }
2814   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2815      store_dest(mach, &d[1], &inst->Dst[0], inst, CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2816   }
2817   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2818      store_dest(mach, &d[0], &inst->Dst[0], inst, CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2819   }
2820   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2821      store_dest(mach, &d[1], &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2822   }
2823}
2824
2825static void
2826exec_rfl(struct tgsi_exec_machine *mach,
2827         const struct tgsi_full_instruction *inst)
2828{
2829   union tgsi_exec_channel r[9];
2830
2831   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XYZ) {
2832      /* r0 = dp3(src0, src0) */
2833      fetch_source(mach, &r[2], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2834      micro_mul(&r[0], &r[2], &r[2]);
2835      fetch_source(mach, &r[4], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2836      micro_mul(&r[8], &r[4], &r[4]);
2837      micro_add(&r[0], &r[0], &r[8]);
2838      fetch_source(mach, &r[6], &inst->Src[0], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2839      micro_mul(&r[8], &r[6], &r[6]);
2840      micro_add(&r[0], &r[0], &r[8]);
2841
2842      /* r1 = dp3(src0, src1) */
2843      fetch_source(mach, &r[3], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2844      micro_mul(&r[1], &r[2], &r[3]);
2845      fetch_source(mach, &r[5], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2846      micro_mul(&r[8], &r[4], &r[5]);
2847      micro_add(&r[1], &r[1], &r[8]);
2848      fetch_source(mach, &r[7], &inst->Src[1], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2849      micro_mul(&r[8], &r[6], &r[7]);
2850      micro_add(&r[1], &r[1], &r[8]);
2851
2852      /* r1 = 2 * r1 / r0 */
2853      micro_add(&r[1], &r[1], &r[1]);
2854      micro_div(&r[1], &r[1], &r[0]);
2855
2856      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2857         micro_mul(&r[2], &r[2], &r[1]);
2858         micro_sub(&r[2], &r[2], &r[3]);
2859         store_dest(mach, &r[2], &inst->Dst[0], inst, CHAN_X, TGSI_EXEC_DATA_FLOAT);
2860      }
2861      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2862         micro_mul(&r[4], &r[4], &r[1]);
2863         micro_sub(&r[4], &r[4], &r[5]);
2864         store_dest(mach, &r[4], &inst->Dst[0], inst, CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2865      }
2866      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2867         micro_mul(&r[6], &r[6], &r[1]);
2868         micro_sub(&r[6], &r[6], &r[7]);
2869         store_dest(mach, &r[6], &inst->Dst[0], inst, CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2870      }
2871   }
2872   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2873      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2874   }
2875}
2876
2877static void
2878exec_xpd(struct tgsi_exec_machine *mach,
2879         const struct tgsi_full_instruction *inst)
2880{
2881   union tgsi_exec_channel r[6];
2882   union tgsi_exec_channel d[3];
2883
2884   fetch_source(mach, &r[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2885   fetch_source(mach, &r[1], &inst->Src[1], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2886
2887   micro_mul(&r[2], &r[0], &r[1]);
2888
2889   fetch_source(mach, &r[3], &inst->Src[0], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2890   fetch_source(mach, &r[4], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2891
2892   micro_mul(&r[5], &r[3], &r[4] );
2893   micro_sub(&d[CHAN_X], &r[2], &r[5]);
2894
2895   fetch_source(mach, &r[2], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2896
2897   micro_mul(&r[3], &r[3], &r[2]);
2898
2899   fetch_source(mach, &r[5], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2900
2901   micro_mul(&r[1], &r[1], &r[5]);
2902   micro_sub(&d[CHAN_Y], &r[3], &r[1]);
2903
2904   micro_mul(&r[5], &r[5], &r[4]);
2905   micro_mul(&r[0], &r[0], &r[2]);
2906   micro_sub(&d[CHAN_Z], &r[5], &r[0]);
2907
2908   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2909      store_dest(mach, &d[CHAN_X], &inst->Dst[0], inst, CHAN_X, TGSI_EXEC_DATA_FLOAT);
2910   }
2911   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2912      store_dest(mach, &d[CHAN_Y], &inst->Dst[0], inst, CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2913   }
2914   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2915      store_dest(mach, &d[CHAN_Z], &inst->Dst[0], inst, CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2916   }
2917   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2918      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2919   }
2920}
2921
2922static void
2923exec_dst(struct tgsi_exec_machine *mach,
2924         const struct tgsi_full_instruction *inst)
2925{
2926   union tgsi_exec_channel r[2];
2927   union tgsi_exec_channel d[4];
2928
2929   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2930      fetch_source(mach, &r[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2931      fetch_source(mach, &r[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2932      micro_mul(&d[CHAN_Y], &r[0], &r[1]);
2933   }
2934   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2935      fetch_source(mach, &d[CHAN_Z], &inst->Src[0], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2936   }
2937   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2938      fetch_source(mach, &d[CHAN_W], &inst->Src[1], CHAN_W, TGSI_EXEC_DATA_FLOAT);
2939   }
2940
2941   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2942      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_X, TGSI_EXEC_DATA_FLOAT);
2943   }
2944   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2945      store_dest(mach, &d[CHAN_Y], &inst->Dst[0], inst, CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2946   }
2947   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2948      store_dest(mach, &d[CHAN_Z], &inst->Dst[0], inst, CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2949   }
2950   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2951      store_dest(mach, &d[CHAN_W], &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2952   }
2953}
2954
2955static void
2956exec_log(struct tgsi_exec_machine *mach,
2957         const struct tgsi_full_instruction *inst)
2958{
2959   union tgsi_exec_channel r[3];
2960
2961   fetch_source(mach, &r[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2962   micro_abs(&r[2], &r[0]);  /* r2 = abs(r0) */
2963   micro_lg2(&r[1], &r[2]);  /* r1 = lg2(r2) */
2964   micro_flr(&r[0], &r[1]);  /* r0 = floor(r1) */
2965   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2966      store_dest(mach, &r[0], &inst->Dst[0], inst, CHAN_X, TGSI_EXEC_DATA_FLOAT);
2967   }
2968   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2969      micro_exp2(&r[0], &r[0]);       /* r0 = 2 ^ r0 */
2970      micro_div(&r[0], &r[2], &r[0]); /* r0 = r2 / r0 */
2971      store_dest(mach, &r[0], &inst->Dst[0], inst, CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2972   }
2973   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2974      store_dest(mach, &r[1], &inst->Dst[0], inst, CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2975   }
2976   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2977      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2978   }
2979}
2980
2981static void
2982exec_exp(struct tgsi_exec_machine *mach,
2983         const struct tgsi_full_instruction *inst)
2984{
2985   union tgsi_exec_channel r[3];
2986
2987   fetch_source(mach, &r[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2988   micro_flr(&r[1], &r[0]);  /* r1 = floor(r0) */
2989   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2990      micro_exp2(&r[2], &r[1]);       /* r2 = 2 ^ r1 */
2991      store_dest(mach, &r[2], &inst->Dst[0], inst, CHAN_X, TGSI_EXEC_DATA_FLOAT);
2992   }
2993   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2994      micro_sub(&r[2], &r[0], &r[1]); /* r2 = r0 - r1 */
2995      store_dest(mach, &r[2], &inst->Dst[0], inst, CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2996   }
2997   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2998      micro_exp2(&r[2], &r[0]);       /* r2 = 2 ^ r0 */
2999      store_dest(mach, &r[2], &inst->Dst[0], inst, CHAN_Z, TGSI_EXEC_DATA_FLOAT);
3000   }
3001   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
3002      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
3003   }
3004}
3005
3006static void
3007exec_lit(struct tgsi_exec_machine *mach,
3008         const struct tgsi_full_instruction *inst)
3009{
3010   union tgsi_exec_channel r[3];
3011   union tgsi_exec_channel d[3];
3012
3013   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_YZ) {
3014      fetch_source(mach, &r[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
3015      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
3016         fetch_source(mach, &r[1], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3017         micro_max(&r[1], &r[1], &ZeroVec);
3018
3019         fetch_source(mach, &r[2], &inst->Src[0], CHAN_W, TGSI_EXEC_DATA_FLOAT);
3020         micro_min(&r[2], &r[2], &P128Vec);
3021         micro_max(&r[2], &r[2], &M128Vec);
3022         micro_pow(&r[1], &r[1], &r[2]);
3023         micro_lt(&d[CHAN_Z], &ZeroVec, &r[0], &r[1], &ZeroVec);
3024         store_dest(mach, &d[CHAN_Z], &inst->Dst[0], inst, CHAN_Z, TGSI_EXEC_DATA_FLOAT);
3025      }
3026      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
3027         micro_max(&d[CHAN_Y], &r[0], &ZeroVec);
3028         store_dest(mach, &d[CHAN_Y], &inst->Dst[0], inst, CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3029      }
3030   }
3031   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
3032      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_X, TGSI_EXEC_DATA_FLOAT);
3033   }
3034
3035   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
3036      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
3037   }
3038}
3039
3040static void
3041exec_break(struct tgsi_exec_machine *mach)
3042{
3043   if (mach->BreakType == TGSI_EXEC_BREAK_INSIDE_LOOP) {
3044      /* turn off loop channels for each enabled exec channel */
3045      mach->LoopMask &= ~mach->ExecMask;
3046      /* Todo: if mach->LoopMask == 0, jump to end of loop */
3047      UPDATE_EXEC_MASK(mach);
3048   } else {
3049      assert(mach->BreakType == TGSI_EXEC_BREAK_INSIDE_SWITCH);
3050
3051      mach->Switch.mask = 0x0;
3052
3053      UPDATE_EXEC_MASK(mach);
3054   }
3055}
3056
3057static void
3058exec_switch(struct tgsi_exec_machine *mach,
3059            const struct tgsi_full_instruction *inst)
3060{
3061   assert(mach->SwitchStackTop < TGSI_EXEC_MAX_SWITCH_NESTING);
3062   assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
3063
3064   mach->SwitchStack[mach->SwitchStackTop++] = mach->Switch;
3065   fetch_source(mach, &mach->Switch.selector, &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_UINT);
3066   mach->Switch.mask = 0x0;
3067   mach->Switch.defaultMask = 0x0;
3068
3069   mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
3070   mach->BreakType = TGSI_EXEC_BREAK_INSIDE_SWITCH;
3071
3072   UPDATE_EXEC_MASK(mach);
3073}
3074
3075static void
3076exec_case(struct tgsi_exec_machine *mach,
3077          const struct tgsi_full_instruction *inst)
3078{
3079   uint prevMask = mach->SwitchStack[mach->SwitchStackTop - 1].mask;
3080   union tgsi_exec_channel src;
3081   uint mask = 0;
3082
3083   fetch_source(mach, &src, &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_UINT);
3084
3085   if (mach->Switch.selector.u[0] == src.u[0]) {
3086      mask |= 0x1;
3087   }
3088   if (mach->Switch.selector.u[1] == src.u[1]) {
3089      mask |= 0x2;
3090   }
3091   if (mach->Switch.selector.u[2] == src.u[2]) {
3092      mask |= 0x4;
3093   }
3094   if (mach->Switch.selector.u[3] == src.u[3]) {
3095      mask |= 0x8;
3096   }
3097
3098   mach->Switch.defaultMask |= mask;
3099
3100   mach->Switch.mask |= mask & prevMask;
3101
3102   UPDATE_EXEC_MASK(mach);
3103}
3104
3105static void
3106exec_default(struct tgsi_exec_machine *mach)
3107{
3108   uint prevMask = mach->SwitchStack[mach->SwitchStackTop - 1].mask;
3109
3110   mach->Switch.mask |= ~mach->Switch.defaultMask & prevMask;
3111
3112   UPDATE_EXEC_MASK(mach);
3113}
3114
3115static void
3116exec_endswitch(struct tgsi_exec_machine *mach)
3117{
3118   mach->Switch = mach->SwitchStack[--mach->SwitchStackTop];
3119   mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
3120
3121   UPDATE_EXEC_MASK(mach);
3122}
3123
3124static void
3125micro_i2f(union tgsi_exec_channel *dst,
3126          const union tgsi_exec_channel *src)
3127{
3128   dst->f[0] = (float)src->i[0];
3129   dst->f[1] = (float)src->i[1];
3130   dst->f[2] = (float)src->i[2];
3131   dst->f[3] = (float)src->i[3];
3132}
3133
3134static void
3135micro_not(union tgsi_exec_channel *dst,
3136          const union tgsi_exec_channel *src)
3137{
3138   dst->u[0] = ~src->u[0];
3139   dst->u[1] = ~src->u[1];
3140   dst->u[2] = ~src->u[2];
3141   dst->u[3] = ~src->u[3];
3142}
3143
3144static void
3145micro_shl(union tgsi_exec_channel *dst,
3146          const union tgsi_exec_channel *src0,
3147          const union tgsi_exec_channel *src1)
3148{
3149   dst->u[0] = src0->u[0] << src1->u[0];
3150   dst->u[1] = src0->u[1] << src1->u[1];
3151   dst->u[2] = src0->u[2] << src1->u[2];
3152   dst->u[3] = src0->u[3] << src1->u[3];
3153}
3154
3155static void
3156micro_and(union tgsi_exec_channel *dst,
3157          const union tgsi_exec_channel *src0,
3158          const union tgsi_exec_channel *src1)
3159{
3160   dst->u[0] = src0->u[0] & src1->u[0];
3161   dst->u[1] = src0->u[1] & src1->u[1];
3162   dst->u[2] = src0->u[2] & src1->u[2];
3163   dst->u[3] = src0->u[3] & src1->u[3];
3164}
3165
3166static void
3167micro_or(union tgsi_exec_channel *dst,
3168         const union tgsi_exec_channel *src0,
3169         const union tgsi_exec_channel *src1)
3170{
3171   dst->u[0] = src0->u[0] | src1->u[0];
3172   dst->u[1] = src0->u[1] | src1->u[1];
3173   dst->u[2] = src0->u[2] | src1->u[2];
3174   dst->u[3] = src0->u[3] | src1->u[3];
3175}
3176
3177static void
3178micro_xor(union tgsi_exec_channel *dst,
3179          const union tgsi_exec_channel *src0,
3180          const union tgsi_exec_channel *src1)
3181{
3182   dst->u[0] = src0->u[0] ^ src1->u[0];
3183   dst->u[1] = src0->u[1] ^ src1->u[1];
3184   dst->u[2] = src0->u[2] ^ src1->u[2];
3185   dst->u[3] = src0->u[3] ^ src1->u[3];
3186}
3187
3188static void
3189micro_mod(union tgsi_exec_channel *dst,
3190          const union tgsi_exec_channel *src0,
3191          const union tgsi_exec_channel *src1)
3192{
3193   dst->i[0] = src0->i[0] % src1->i[0];
3194   dst->i[1] = src0->i[1] % src1->i[1];
3195   dst->i[2] = src0->i[2] % src1->i[2];
3196   dst->i[3] = src0->i[3] % src1->i[3];
3197}
3198
3199static void
3200micro_f2i(union tgsi_exec_channel *dst,
3201          const union tgsi_exec_channel *src)
3202{
3203   dst->i[0] = (int)src->f[0];
3204   dst->i[1] = (int)src->f[1];
3205   dst->i[2] = (int)src->f[2];
3206   dst->i[3] = (int)src->f[3];
3207}
3208
3209static void
3210micro_idiv(union tgsi_exec_channel *dst,
3211           const union tgsi_exec_channel *src0,
3212           const union tgsi_exec_channel *src1)
3213{
3214   dst->i[0] = src0->i[0] / src1->i[0];
3215   dst->i[1] = src0->i[1] / src1->i[1];
3216   dst->i[2] = src0->i[2] / src1->i[2];
3217   dst->i[3] = src0->i[3] / src1->i[3];
3218}
3219
3220static void
3221micro_imax(union tgsi_exec_channel *dst,
3222           const union tgsi_exec_channel *src0,
3223           const union tgsi_exec_channel *src1)
3224{
3225   dst->i[0] = src0->i[0] > src1->i[0] ? src0->i[0] : src1->i[0];
3226   dst->i[1] = src0->i[1] > src1->i[1] ? src0->i[1] : src1->i[1];
3227   dst->i[2] = src0->i[2] > src1->i[2] ? src0->i[2] : src1->i[2];
3228   dst->i[3] = src0->i[3] > src1->i[3] ? src0->i[3] : src1->i[3];
3229}
3230
3231static void
3232micro_imin(union tgsi_exec_channel *dst,
3233           const union tgsi_exec_channel *src0,
3234           const union tgsi_exec_channel *src1)
3235{
3236   dst->i[0] = src0->i[0] < src1->i[0] ? src0->i[0] : src1->i[0];
3237   dst->i[1] = src0->i[1] < src1->i[1] ? src0->i[1] : src1->i[1];
3238   dst->i[2] = src0->i[2] < src1->i[2] ? src0->i[2] : src1->i[2];
3239   dst->i[3] = src0->i[3] < src1->i[3] ? src0->i[3] : src1->i[3];
3240}
3241
3242static void
3243micro_isge(union tgsi_exec_channel *dst,
3244           const union tgsi_exec_channel *src0,
3245           const union tgsi_exec_channel *src1)
3246{
3247   dst->i[0] = src0->i[0] >= src1->i[0] ? -1 : 0;
3248   dst->i[1] = src0->i[1] >= src1->i[1] ? -1 : 0;
3249   dst->i[2] = src0->i[2] >= src1->i[2] ? -1 : 0;
3250   dst->i[3] = src0->i[3] >= src1->i[3] ? -1 : 0;
3251}
3252
3253static void
3254micro_ishr(union tgsi_exec_channel *dst,
3255           const union tgsi_exec_channel *src0,
3256           const union tgsi_exec_channel *src1)
3257{
3258   dst->i[0] = src0->i[0] >> src1->i[0];
3259   dst->i[1] = src0->i[1] >> src1->i[1];
3260   dst->i[2] = src0->i[2] >> src1->i[2];
3261   dst->i[3] = src0->i[3] >> src1->i[3];
3262}
3263
3264static void
3265micro_islt(union tgsi_exec_channel *dst,
3266           const union tgsi_exec_channel *src0,
3267           const union tgsi_exec_channel *src1)
3268{
3269   dst->i[0] = src0->i[0] < src1->i[0] ? -1 : 0;
3270   dst->i[1] = src0->i[1] < src1->i[1] ? -1 : 0;
3271   dst->i[2] = src0->i[2] < src1->i[2] ? -1 : 0;
3272   dst->i[3] = src0->i[3] < src1->i[3] ? -1 : 0;
3273}
3274
3275static void
3276micro_f2u(union tgsi_exec_channel *dst,
3277          const union tgsi_exec_channel *src)
3278{
3279   dst->u[0] = (uint)src->f[0];
3280   dst->u[1] = (uint)src->f[1];
3281   dst->u[2] = (uint)src->f[2];
3282   dst->u[3] = (uint)src->f[3];
3283}
3284
3285static void
3286micro_u2f(union tgsi_exec_channel *dst,
3287          const union tgsi_exec_channel *src)
3288{
3289   dst->f[0] = (float)src->u[0];
3290   dst->f[1] = (float)src->u[1];
3291   dst->f[2] = (float)src->u[2];
3292   dst->f[3] = (float)src->u[3];
3293}
3294
3295static void
3296micro_uadd(union tgsi_exec_channel *dst,
3297           const union tgsi_exec_channel *src0,
3298           const union tgsi_exec_channel *src1)
3299{
3300   dst->u[0] = src0->u[0] + src1->u[0];
3301   dst->u[1] = src0->u[1] + src1->u[1];
3302   dst->u[2] = src0->u[2] + src1->u[2];
3303   dst->u[3] = src0->u[3] + src1->u[3];
3304}
3305
3306static void
3307micro_udiv(union tgsi_exec_channel *dst,
3308           const union tgsi_exec_channel *src0,
3309           const union tgsi_exec_channel *src1)
3310{
3311   dst->u[0] = src0->u[0] / src1->u[0];
3312   dst->u[1] = src0->u[1] / src1->u[1];
3313   dst->u[2] = src0->u[2] / src1->u[2];
3314   dst->u[3] = src0->u[3] / src1->u[3];
3315}
3316
3317static void
3318micro_umad(union tgsi_exec_channel *dst,
3319           const union tgsi_exec_channel *src0,
3320           const union tgsi_exec_channel *src1,
3321           const union tgsi_exec_channel *src2)
3322{
3323   dst->u[0] = src0->u[0] * src1->u[0] + src2->u[0];
3324   dst->u[1] = src0->u[1] * src1->u[1] + src2->u[1];
3325   dst->u[2] = src0->u[2] * src1->u[2] + src2->u[2];
3326   dst->u[3] = src0->u[3] * src1->u[3] + src2->u[3];
3327}
3328
3329static void
3330micro_umax(union tgsi_exec_channel *dst,
3331           const union tgsi_exec_channel *src0,
3332           const union tgsi_exec_channel *src1)
3333{
3334   dst->u[0] = src0->u[0] > src1->u[0] ? src0->u[0] : src1->u[0];
3335   dst->u[1] = src0->u[1] > src1->u[1] ? src0->u[1] : src1->u[1];
3336   dst->u[2] = src0->u[2] > src1->u[2] ? src0->u[2] : src1->u[2];
3337   dst->u[3] = src0->u[3] > src1->u[3] ? src0->u[3] : src1->u[3];
3338}
3339
3340static void
3341micro_umin(union tgsi_exec_channel *dst,
3342           const union tgsi_exec_channel *src0,
3343           const union tgsi_exec_channel *src1)
3344{
3345   dst->u[0] = src0->u[0] < src1->u[0] ? src0->u[0] : src1->u[0];
3346   dst->u[1] = src0->u[1] < src1->u[1] ? src0->u[1] : src1->u[1];
3347   dst->u[2] = src0->u[2] < src1->u[2] ? src0->u[2] : src1->u[2];
3348   dst->u[3] = src0->u[3] < src1->u[3] ? src0->u[3] : src1->u[3];
3349}
3350
3351static void
3352micro_umod(union tgsi_exec_channel *dst,
3353           const union tgsi_exec_channel *src0,
3354           const union tgsi_exec_channel *src1)
3355{
3356   dst->u[0] = src0->u[0] % src1->u[0];
3357   dst->u[1] = src0->u[1] % src1->u[1];
3358   dst->u[2] = src0->u[2] % src1->u[2];
3359   dst->u[3] = src0->u[3] % src1->u[3];
3360}
3361
3362static void
3363micro_umul(union tgsi_exec_channel *dst,
3364           const union tgsi_exec_channel *src0,
3365           const union tgsi_exec_channel *src1)
3366{
3367   dst->u[0] = src0->u[0] * src1->u[0];
3368   dst->u[1] = src0->u[1] * src1->u[1];
3369   dst->u[2] = src0->u[2] * src1->u[2];
3370   dst->u[3] = src0->u[3] * src1->u[3];
3371}
3372
3373static void
3374micro_useq(union tgsi_exec_channel *dst,
3375           const union tgsi_exec_channel *src0,
3376           const union tgsi_exec_channel *src1)
3377{
3378   dst->u[0] = src0->u[0] == src1->u[0] ? ~0 : 0;
3379   dst->u[1] = src0->u[1] == src1->u[1] ? ~0 : 0;
3380   dst->u[2] = src0->u[2] == src1->u[2] ? ~0 : 0;
3381   dst->u[3] = src0->u[3] == src1->u[3] ? ~0 : 0;
3382}
3383
3384static void
3385micro_usge(union tgsi_exec_channel *dst,
3386           const union tgsi_exec_channel *src0,
3387           const union tgsi_exec_channel *src1)
3388{
3389   dst->u[0] = src0->u[0] >= src1->u[0] ? ~0 : 0;
3390   dst->u[1] = src0->u[1] >= src1->u[1] ? ~0 : 0;
3391   dst->u[2] = src0->u[2] >= src1->u[2] ? ~0 : 0;
3392   dst->u[3] = src0->u[3] >= src1->u[3] ? ~0 : 0;
3393}
3394
3395static void
3396micro_ushr(union tgsi_exec_channel *dst,
3397           const union tgsi_exec_channel *src0,
3398           const union tgsi_exec_channel *src1)
3399{
3400   dst->u[0] = src0->u[0] >> src1->u[0];
3401   dst->u[1] = src0->u[1] >> src1->u[1];
3402   dst->u[2] = src0->u[2] >> src1->u[2];
3403   dst->u[3] = src0->u[3] >> src1->u[3];
3404}
3405
3406static void
3407micro_uslt(union tgsi_exec_channel *dst,
3408           const union tgsi_exec_channel *src0,
3409           const union tgsi_exec_channel *src1)
3410{
3411   dst->u[0] = src0->u[0] < src1->u[0] ? ~0 : 0;
3412   dst->u[1] = src0->u[1] < src1->u[1] ? ~0 : 0;
3413   dst->u[2] = src0->u[2] < src1->u[2] ? ~0 : 0;
3414   dst->u[3] = src0->u[3] < src1->u[3] ? ~0 : 0;
3415}
3416
3417static void
3418micro_usne(union tgsi_exec_channel *dst,
3419           const union tgsi_exec_channel *src0,
3420           const union tgsi_exec_channel *src1)
3421{
3422   dst->u[0] = src0->u[0] != src1->u[0] ? ~0 : 0;
3423   dst->u[1] = src0->u[1] != src1->u[1] ? ~0 : 0;
3424   dst->u[2] = src0->u[2] != src1->u[2] ? ~0 : 0;
3425   dst->u[3] = src0->u[3] != src1->u[3] ? ~0 : 0;
3426}
3427
3428static void
3429micro_uarl(union tgsi_exec_channel *dst,
3430           const union tgsi_exec_channel *src)
3431{
3432   dst->i[0] = src->u[0];
3433   dst->i[1] = src->u[1];
3434   dst->i[2] = src->u[2];
3435   dst->i[3] = src->u[3];
3436}
3437
3438static void
3439micro_ucmp(union tgsi_exec_channel *dst,
3440           const union tgsi_exec_channel *src0,
3441           const union tgsi_exec_channel *src1,
3442           const union tgsi_exec_channel *src2)
3443{
3444   dst->u[0] = src0->u[0] ? src1->u[0] : src2->u[0];
3445   dst->u[1] = src0->u[1] ? src1->u[1] : src2->u[1];
3446   dst->u[2] = src0->u[2] ? src1->u[2] : src2->u[2];
3447   dst->u[3] = src0->u[3] ? src1->u[3] : src2->u[3];
3448}
3449
3450static void
3451exec_instruction(
3452   struct tgsi_exec_machine *mach,
3453   const struct tgsi_full_instruction *inst,
3454   int *pc )
3455{
3456   union tgsi_exec_channel r[10];
3457
3458   (*pc)++;
3459
3460   switch (inst->Instruction.Opcode) {
3461   case TGSI_OPCODE_ARL:
3462      exec_vector_unary(mach, inst, micro_arl, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
3463      break;
3464
3465   case TGSI_OPCODE_MOV:
3466      exec_vector_unary(mach, inst, micro_mov, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
3467      break;
3468
3469   case TGSI_OPCODE_LIT:
3470      exec_lit(mach, inst);
3471      break;
3472
3473   case TGSI_OPCODE_RCP:
3474      exec_scalar_unary(mach, inst, micro_rcp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3475      break;
3476
3477   case TGSI_OPCODE_RSQ:
3478      exec_scalar_unary(mach, inst, micro_rsq, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3479      break;
3480
3481   case TGSI_OPCODE_EXP:
3482      exec_exp(mach, inst);
3483      break;
3484
3485   case TGSI_OPCODE_LOG:
3486      exec_log(mach, inst);
3487      break;
3488
3489   case TGSI_OPCODE_MUL:
3490      exec_vector_binary(mach, inst, micro_mul, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3491      break;
3492
3493   case TGSI_OPCODE_ADD:
3494      exec_vector_binary(mach, inst, micro_add, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3495      break;
3496
3497   case TGSI_OPCODE_DP3:
3498      exec_dp3(mach, inst);
3499      break;
3500
3501   case TGSI_OPCODE_DP4:
3502      exec_dp4(mach, inst);
3503      break;
3504
3505   case TGSI_OPCODE_DST:
3506      exec_dst(mach, inst);
3507      break;
3508
3509   case TGSI_OPCODE_MIN:
3510      exec_vector_binary(mach, inst, micro_min, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3511      break;
3512
3513   case TGSI_OPCODE_MAX:
3514      exec_vector_binary(mach, inst, micro_max, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3515      break;
3516
3517   case TGSI_OPCODE_SLT:
3518      exec_vector_binary(mach, inst, micro_slt, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3519      break;
3520
3521   case TGSI_OPCODE_SGE:
3522      exec_vector_binary(mach, inst, micro_sge, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3523      break;
3524
3525   case TGSI_OPCODE_MAD:
3526      exec_vector_trinary(mach, inst, micro_mad, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3527      break;
3528
3529   case TGSI_OPCODE_SUB:
3530      exec_vector_binary(mach, inst, micro_sub, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3531      break;
3532
3533   case TGSI_OPCODE_LRP:
3534      exec_vector_trinary(mach, inst, micro_lrp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3535      break;
3536
3537   case TGSI_OPCODE_CND:
3538      exec_vector_trinary(mach, inst, micro_cnd, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3539      break;
3540
3541   case TGSI_OPCODE_DP2A:
3542      exec_dp2a(mach, inst);
3543      break;
3544
3545   case TGSI_OPCODE_FRC:
3546      exec_vector_unary(mach, inst, micro_frc, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3547      break;
3548
3549   case TGSI_OPCODE_CLAMP:
3550      exec_vector_trinary(mach, inst, micro_clamp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3551      break;
3552
3553   case TGSI_OPCODE_FLR:
3554      exec_vector_unary(mach, inst, micro_flr, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3555      break;
3556
3557   case TGSI_OPCODE_ROUND:
3558      exec_vector_unary(mach, inst, micro_rnd, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3559      break;
3560
3561   case TGSI_OPCODE_EX2:
3562      exec_scalar_unary(mach, inst, micro_exp2, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3563      break;
3564
3565   case TGSI_OPCODE_LG2:
3566      exec_scalar_unary(mach, inst, micro_lg2, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3567      break;
3568
3569   case TGSI_OPCODE_POW:
3570      exec_scalar_binary(mach, inst, micro_pow, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3571      break;
3572
3573   case TGSI_OPCODE_XPD:
3574      exec_xpd(mach, inst);
3575      break;
3576
3577   case TGSI_OPCODE_ABS:
3578      exec_vector_unary(mach, inst, micro_abs, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3579      break;
3580
3581   case TGSI_OPCODE_RCC:
3582      exec_scalar_unary(mach, inst, micro_rcc, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3583      break;
3584
3585   case TGSI_OPCODE_DPH:
3586      exec_dph(mach, inst);
3587      break;
3588
3589   case TGSI_OPCODE_COS:
3590      exec_scalar_unary(mach, inst, micro_cos, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3591      break;
3592
3593   case TGSI_OPCODE_DDX:
3594      exec_vector_unary(mach, inst, micro_ddx, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3595      break;
3596
3597   case TGSI_OPCODE_DDY:
3598      exec_vector_unary(mach, inst, micro_ddy, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3599      break;
3600
3601   case TGSI_OPCODE_KILP:
3602      exec_kilp (mach, inst);
3603      break;
3604
3605   case TGSI_OPCODE_KIL:
3606      exec_kil (mach, inst);
3607      break;
3608
3609   case TGSI_OPCODE_PK2H:
3610      assert (0);
3611      break;
3612
3613   case TGSI_OPCODE_PK2US:
3614      assert (0);
3615      break;
3616
3617   case TGSI_OPCODE_PK4B:
3618      assert (0);
3619      break;
3620
3621   case TGSI_OPCODE_PK4UB:
3622      assert (0);
3623      break;
3624
3625   case TGSI_OPCODE_RFL:
3626      exec_rfl(mach, inst);
3627      break;
3628
3629   case TGSI_OPCODE_SEQ:
3630      exec_vector_binary(mach, inst, micro_seq, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3631      break;
3632
3633   case TGSI_OPCODE_SFL:
3634      exec_vector(mach, inst, micro_sfl, TGSI_EXEC_DATA_FLOAT);
3635      break;
3636
3637   case TGSI_OPCODE_SGT:
3638      exec_vector_binary(mach, inst, micro_sgt, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3639      break;
3640
3641   case TGSI_OPCODE_SIN:
3642      exec_scalar_unary(mach, inst, micro_sin, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3643      break;
3644
3645   case TGSI_OPCODE_SLE:
3646      exec_vector_binary(mach, inst, micro_sle, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3647      break;
3648
3649   case TGSI_OPCODE_SNE:
3650      exec_vector_binary(mach, inst, micro_sne, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3651      break;
3652
3653   case TGSI_OPCODE_STR:
3654      exec_vector(mach, inst, micro_str, TGSI_EXEC_DATA_FLOAT);
3655      break;
3656
3657   case TGSI_OPCODE_TEX:
3658      /* simple texture lookup */
3659      /* src[0] = texcoord */
3660      /* src[1] = sampler unit */
3661      exec_tex(mach, inst, TEX_MODIFIER_NONE);
3662      break;
3663
3664   case TGSI_OPCODE_TXB:
3665      /* Texture lookup with lod bias */
3666      /* src[0] = texcoord (src[0].w = LOD bias) */
3667      /* src[1] = sampler unit */
3668      exec_tex(mach, inst, TEX_MODIFIER_LOD_BIAS);
3669      break;
3670
3671   case TGSI_OPCODE_TXD:
3672      /* Texture lookup with explict partial derivatives */
3673      /* src[0] = texcoord */
3674      /* src[1] = d[strq]/dx */
3675      /* src[2] = d[strq]/dy */
3676      /* src[3] = sampler unit */
3677      exec_txd(mach, inst);
3678      break;
3679
3680   case TGSI_OPCODE_TXL:
3681      /* Texture lookup with explit LOD */
3682      /* src[0] = texcoord (src[0].w = LOD) */
3683      /* src[1] = sampler unit */
3684      exec_tex(mach, inst, TEX_MODIFIER_EXPLICIT_LOD);
3685      break;
3686
3687   case TGSI_OPCODE_TXP:
3688      /* Texture lookup with projection */
3689      /* src[0] = texcoord (src[0].w = projection) */
3690      /* src[1] = sampler unit */
3691      exec_tex(mach, inst, TEX_MODIFIER_PROJECTED);
3692      break;
3693
3694   case TGSI_OPCODE_UP2H:
3695      assert (0);
3696      break;
3697
3698   case TGSI_OPCODE_UP2US:
3699      assert (0);
3700      break;
3701
3702   case TGSI_OPCODE_UP4B:
3703      assert (0);
3704      break;
3705
3706   case TGSI_OPCODE_UP4UB:
3707      assert (0);
3708      break;
3709
3710   case TGSI_OPCODE_X2D:
3711      exec_x2d(mach, inst);
3712      break;
3713
3714   case TGSI_OPCODE_ARA:
3715      assert (0);
3716      break;
3717
3718   case TGSI_OPCODE_ARR:
3719      exec_vector_unary(mach, inst, micro_arr, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
3720      break;
3721
3722   case TGSI_OPCODE_BRA:
3723      assert (0);
3724      break;
3725
3726   case TGSI_OPCODE_CAL:
3727      /* skip the call if no execution channels are enabled */
3728      if (mach->ExecMask) {
3729         /* do the call */
3730
3731         /* First, record the depths of the execution stacks.
3732          * This is important for deeply nested/looped return statements.
3733          * We have to unwind the stacks by the correct amount.  For a
3734          * real code generator, we could determine the number of entries
3735          * to pop off each stack with simple static analysis and avoid
3736          * implementing this data structure at run time.
3737          */
3738         mach->CallStack[mach->CallStackTop].CondStackTop = mach->CondStackTop;
3739         mach->CallStack[mach->CallStackTop].LoopStackTop = mach->LoopStackTop;
3740         mach->CallStack[mach->CallStackTop].ContStackTop = mach->ContStackTop;
3741         mach->CallStack[mach->CallStackTop].SwitchStackTop = mach->SwitchStackTop;
3742         mach->CallStack[mach->CallStackTop].BreakStackTop = mach->BreakStackTop;
3743         /* note that PC was already incremented above */
3744         mach->CallStack[mach->CallStackTop].ReturnAddr = *pc;
3745
3746         mach->CallStackTop++;
3747
3748         /* Second, push the Cond, Loop, Cont, Func stacks */
3749         assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
3750         assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3751         assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3752         assert(mach->SwitchStackTop < TGSI_EXEC_MAX_SWITCH_NESTING);
3753         assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
3754         assert(mach->FuncStackTop < TGSI_EXEC_MAX_CALL_NESTING);
3755
3756         mach->CondStack[mach->CondStackTop++] = mach->CondMask;
3757         mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
3758         mach->ContStack[mach->ContStackTop++] = mach->ContMask;
3759         mach->SwitchStack[mach->SwitchStackTop++] = mach->Switch;
3760         mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
3761         mach->FuncStack[mach->FuncStackTop++] = mach->FuncMask;
3762
3763         /* Finally, jump to the subroutine */
3764         *pc = inst->Label.Label;
3765      }
3766      break;
3767
3768   case TGSI_OPCODE_RET:
3769      mach->FuncMask &= ~mach->ExecMask;
3770      UPDATE_EXEC_MASK(mach);
3771
3772      if (mach->FuncMask == 0x0) {
3773         /* really return now (otherwise, keep executing) */
3774
3775         if (mach->CallStackTop == 0) {
3776            /* returning from main() */
3777            mach->CondStackTop = 0;
3778            mach->LoopStackTop = 0;
3779            *pc = -1;
3780            return;
3781         }
3782
3783         assert(mach->CallStackTop > 0);
3784         mach->CallStackTop--;
3785
3786         mach->CondStackTop = mach->CallStack[mach->CallStackTop].CondStackTop;
3787         mach->CondMask = mach->CondStack[mach->CondStackTop];
3788
3789         mach->LoopStackTop = mach->CallStack[mach->CallStackTop].LoopStackTop;
3790         mach->LoopMask = mach->LoopStack[mach->LoopStackTop];
3791
3792         mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
3793         mach->ContMask = mach->ContStack[mach->ContStackTop];
3794
3795         mach->SwitchStackTop = mach->CallStack[mach->CallStackTop].SwitchStackTop;
3796         mach->Switch = mach->SwitchStack[mach->SwitchStackTop];
3797
3798         mach->BreakStackTop = mach->CallStack[mach->CallStackTop].BreakStackTop;
3799         mach->BreakType = mach->BreakStack[mach->BreakStackTop];
3800
3801         assert(mach->FuncStackTop > 0);
3802         mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
3803
3804         *pc = mach->CallStack[mach->CallStackTop].ReturnAddr;
3805
3806         UPDATE_EXEC_MASK(mach);
3807      }
3808      break;
3809
3810   case TGSI_OPCODE_SSG:
3811      exec_vector_unary(mach, inst, micro_sgn, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3812      break;
3813
3814   case TGSI_OPCODE_CMP:
3815      exec_vector_trinary(mach, inst, micro_cmp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3816      break;
3817
3818   case TGSI_OPCODE_SCS:
3819      exec_scs(mach, inst);
3820      break;
3821
3822   case TGSI_OPCODE_NRM:
3823      exec_nrm3(mach, inst);
3824      break;
3825
3826   case TGSI_OPCODE_NRM4:
3827      exec_nrm4(mach, inst);
3828      break;
3829
3830   case TGSI_OPCODE_DIV:
3831      exec_vector_binary(mach, inst, micro_div, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3832      break;
3833
3834   case TGSI_OPCODE_DP2:
3835      exec_dp2(mach, inst);
3836      break;
3837
3838   case TGSI_OPCODE_IF:
3839      /* push CondMask */
3840      assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
3841      mach->CondStack[mach->CondStackTop++] = mach->CondMask;
3842      FETCH( &r[0], 0, CHAN_X );
3843      /* update CondMask */
3844      if( ! r[0].u[0] ) {
3845         mach->CondMask &= ~0x1;
3846      }
3847      if( ! r[0].u[1] ) {
3848         mach->CondMask &= ~0x2;
3849      }
3850      if( ! r[0].u[2] ) {
3851         mach->CondMask &= ~0x4;
3852      }
3853      if( ! r[0].u[3] ) {
3854         mach->CondMask &= ~0x8;
3855      }
3856      UPDATE_EXEC_MASK(mach);
3857      /* Todo: If CondMask==0, jump to ELSE */
3858      break;
3859
3860   case TGSI_OPCODE_ELSE:
3861      /* invert CondMask wrt previous mask */
3862      {
3863         uint prevMask;
3864         assert(mach->CondStackTop > 0);
3865         prevMask = mach->CondStack[mach->CondStackTop - 1];
3866         mach->CondMask = ~mach->CondMask & prevMask;
3867         UPDATE_EXEC_MASK(mach);
3868         /* Todo: If CondMask==0, jump to ENDIF */
3869      }
3870      break;
3871
3872   case TGSI_OPCODE_ENDIF:
3873      /* pop CondMask */
3874      assert(mach->CondStackTop > 0);
3875      mach->CondMask = mach->CondStack[--mach->CondStackTop];
3876      UPDATE_EXEC_MASK(mach);
3877      break;
3878
3879   case TGSI_OPCODE_END:
3880      /* make sure we end primitives which haven't
3881       * been explicitly emitted */
3882      conditional_emit_primitive(mach);
3883      /* halt execution */
3884      *pc = -1;
3885      break;
3886
3887   case TGSI_OPCODE_PUSHA:
3888      assert (0);
3889      break;
3890
3891   case TGSI_OPCODE_POPA:
3892      assert (0);
3893      break;
3894
3895   case TGSI_OPCODE_CEIL:
3896      exec_vector_unary(mach, inst, micro_ceil, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3897      break;
3898
3899   case TGSI_OPCODE_I2F:
3900      exec_vector_unary(mach, inst, micro_i2f, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_INT);
3901      break;
3902
3903   case TGSI_OPCODE_NOT:
3904      exec_vector_unary(mach, inst, micro_not, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3905      break;
3906
3907   case TGSI_OPCODE_TRUNC:
3908      exec_vector_unary(mach, inst, micro_trunc, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3909      break;
3910
3911   case TGSI_OPCODE_SHL:
3912      exec_vector_binary(mach, inst, micro_shl, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3913      break;
3914
3915   case TGSI_OPCODE_AND:
3916      exec_vector_binary(mach, inst, micro_and, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3917      break;
3918
3919   case TGSI_OPCODE_OR:
3920      exec_vector_binary(mach, inst, micro_or, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3921      break;
3922
3923   case TGSI_OPCODE_MOD:
3924      exec_vector_binary(mach, inst, micro_mod, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3925      break;
3926
3927   case TGSI_OPCODE_XOR:
3928      exec_vector_binary(mach, inst, micro_xor, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3929      break;
3930
3931   case TGSI_OPCODE_SAD:
3932      assert (0);
3933      break;
3934
3935   case TGSI_OPCODE_TXF:
3936      exec_txf(mach, inst);
3937      break;
3938
3939   case TGSI_OPCODE_TXQ:
3940      exec_txq(mach, inst);
3941      break;
3942
3943   case TGSI_OPCODE_EMIT:
3944      emit_vertex(mach);
3945      break;
3946
3947   case TGSI_OPCODE_ENDPRIM:
3948      emit_primitive(mach);
3949      break;
3950
3951   case TGSI_OPCODE_BGNLOOP:
3952      /* push LoopMask and ContMasks */
3953      assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3954      assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3955      assert(mach->LoopLabelStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3956      assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
3957
3958      mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
3959      mach->ContStack[mach->ContStackTop++] = mach->ContMask;
3960      mach->LoopLabelStack[mach->LoopLabelStackTop++] = *pc - 1;
3961      mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
3962      mach->BreakType = TGSI_EXEC_BREAK_INSIDE_LOOP;
3963      break;
3964
3965   case TGSI_OPCODE_ENDLOOP:
3966      /* Restore ContMask, but don't pop */
3967      assert(mach->ContStackTop > 0);
3968      mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
3969      UPDATE_EXEC_MASK(mach);
3970      if (mach->ExecMask) {
3971         /* repeat loop: jump to instruction just past BGNLOOP */
3972         assert(mach->LoopLabelStackTop > 0);
3973         *pc = mach->LoopLabelStack[mach->LoopLabelStackTop - 1] + 1;
3974      }
3975      else {
3976         /* exit loop: pop LoopMask */
3977         assert(mach->LoopStackTop > 0);
3978         mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
3979         /* pop ContMask */
3980         assert(mach->ContStackTop > 0);
3981         mach->ContMask = mach->ContStack[--mach->ContStackTop];
3982         assert(mach->LoopLabelStackTop > 0);
3983         --mach->LoopLabelStackTop;
3984
3985         mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
3986      }
3987      UPDATE_EXEC_MASK(mach);
3988      break;
3989
3990   case TGSI_OPCODE_BRK:
3991      exec_break(mach);
3992      break;
3993
3994   case TGSI_OPCODE_CONT:
3995      /* turn off cont channels for each enabled exec channel */
3996      mach->ContMask &= ~mach->ExecMask;
3997      /* Todo: if mach->LoopMask == 0, jump to end of loop */
3998      UPDATE_EXEC_MASK(mach);
3999      break;
4000
4001   case TGSI_OPCODE_BGNSUB:
4002      /* no-op */
4003      break;
4004
4005   case TGSI_OPCODE_ENDSUB:
4006      /*
4007       * XXX: This really should be a no-op. We should never reach this opcode.
4008       */
4009
4010      assert(mach->CallStackTop > 0);
4011      mach->CallStackTop--;
4012
4013      mach->CondStackTop = mach->CallStack[mach->CallStackTop].CondStackTop;
4014      mach->CondMask = mach->CondStack[mach->CondStackTop];
4015
4016      mach->LoopStackTop = mach->CallStack[mach->CallStackTop].LoopStackTop;
4017      mach->LoopMask = mach->LoopStack[mach->LoopStackTop];
4018
4019      mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
4020      mach->ContMask = mach->ContStack[mach->ContStackTop];
4021
4022      mach->SwitchStackTop = mach->CallStack[mach->CallStackTop].SwitchStackTop;
4023      mach->Switch = mach->SwitchStack[mach->SwitchStackTop];
4024
4025      mach->BreakStackTop = mach->CallStack[mach->CallStackTop].BreakStackTop;
4026      mach->BreakType = mach->BreakStack[mach->BreakStackTop];
4027
4028      assert(mach->FuncStackTop > 0);
4029      mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
4030
4031      *pc = mach->CallStack[mach->CallStackTop].ReturnAddr;
4032
4033      UPDATE_EXEC_MASK(mach);
4034      break;
4035
4036   case TGSI_OPCODE_NOP:
4037      break;
4038
4039   case TGSI_OPCODE_BREAKC:
4040      FETCH(&r[0], 0, CHAN_X);
4041      /* update LoopMask */
4042      if (r[0].u[0] && (mach->ExecMask & 0x1)) {
4043         mach->LoopMask &= ~0x1;
4044      }
4045      if (r[0].u[1] && (mach->ExecMask & 0x2)) {
4046         mach->LoopMask &= ~0x2;
4047      }
4048      if (r[0].u[2] && (mach->ExecMask & 0x4)) {
4049         mach->LoopMask &= ~0x4;
4050      }
4051      if (r[0].u[3] && (mach->ExecMask & 0x8)) {
4052         mach->LoopMask &= ~0x8;
4053      }
4054      /* Todo: if mach->LoopMask == 0, jump to end of loop */
4055      UPDATE_EXEC_MASK(mach);
4056      break;
4057
4058   case TGSI_OPCODE_F2I:
4059      exec_vector_unary(mach, inst, micro_f2i, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
4060      break;
4061
4062   case TGSI_OPCODE_IDIV:
4063      exec_vector_binary(mach, inst, micro_idiv, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
4064      break;
4065
4066   case TGSI_OPCODE_IMAX:
4067      exec_vector_binary(mach, inst, micro_imax, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
4068      break;
4069
4070   case TGSI_OPCODE_IMIN:
4071      exec_vector_binary(mach, inst, micro_imin, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
4072      break;
4073
4074   case TGSI_OPCODE_INEG:
4075      exec_vector_unary(mach, inst, micro_ineg, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
4076      break;
4077
4078   case TGSI_OPCODE_ISGE:
4079      exec_vector_binary(mach, inst, micro_isge, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
4080      break;
4081
4082   case TGSI_OPCODE_ISHR:
4083      exec_vector_binary(mach, inst, micro_ishr, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
4084      break;
4085
4086   case TGSI_OPCODE_ISLT:
4087      exec_vector_binary(mach, inst, micro_islt, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
4088      break;
4089
4090   case TGSI_OPCODE_F2U:
4091      exec_vector_unary(mach, inst, micro_f2u, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
4092      break;
4093
4094   case TGSI_OPCODE_U2F:
4095      exec_vector_unary(mach, inst, micro_u2f, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_UINT);
4096      break;
4097
4098   case TGSI_OPCODE_UADD:
4099      exec_vector_binary(mach, inst, micro_uadd, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4100      break;
4101
4102   case TGSI_OPCODE_UDIV:
4103      exec_vector_binary(mach, inst, micro_udiv, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4104      break;
4105
4106   case TGSI_OPCODE_UMAD:
4107      exec_vector_trinary(mach, inst, micro_umad, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4108      break;
4109
4110   case TGSI_OPCODE_UMAX:
4111      exec_vector_binary(mach, inst, micro_umax, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4112      break;
4113
4114   case TGSI_OPCODE_UMIN:
4115      exec_vector_binary(mach, inst, micro_umin, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4116      break;
4117
4118   case TGSI_OPCODE_UMOD:
4119      exec_vector_binary(mach, inst, micro_umod, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4120      break;
4121
4122   case TGSI_OPCODE_UMUL:
4123      exec_vector_binary(mach, inst, micro_umul, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4124      break;
4125
4126   case TGSI_OPCODE_USEQ:
4127      exec_vector_binary(mach, inst, micro_useq, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4128      break;
4129
4130   case TGSI_OPCODE_USGE:
4131      exec_vector_binary(mach, inst, micro_usge, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4132      break;
4133
4134   case TGSI_OPCODE_USHR:
4135      exec_vector_binary(mach, inst, micro_ushr, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4136      break;
4137
4138   case TGSI_OPCODE_USLT:
4139      exec_vector_binary(mach, inst, micro_uslt, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4140      break;
4141
4142   case TGSI_OPCODE_USNE:
4143      exec_vector_binary(mach, inst, micro_usne, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4144      break;
4145
4146   case TGSI_OPCODE_SWITCH:
4147      exec_switch(mach, inst);
4148      break;
4149
4150   case TGSI_OPCODE_CASE:
4151      exec_case(mach, inst);
4152      break;
4153
4154   case TGSI_OPCODE_DEFAULT:
4155      exec_default(mach);
4156      break;
4157
4158   case TGSI_OPCODE_ENDSWITCH:
4159      exec_endswitch(mach);
4160      break;
4161
4162   case TGSI_OPCODE_LOAD:
4163      assert(0);
4164      break;
4165
4166   case TGSI_OPCODE_LOAD_MS:
4167      assert(0);
4168      break;
4169
4170   case TGSI_OPCODE_SAMPLE:
4171      exec_sample(mach, inst, TEX_MODIFIER_NONE);
4172      break;
4173
4174   case TGSI_OPCODE_SAMPLE_B:
4175      exec_sample(mach, inst, TEX_MODIFIER_LOD_BIAS);
4176      break;
4177
4178   case TGSI_OPCODE_SAMPLE_C:
4179      exec_sample(mach, inst, TEX_MODIFIER_NONE);
4180      break;
4181
4182   case TGSI_OPCODE_SAMPLE_C_LZ:
4183      exec_sample(mach, inst, TEX_MODIFIER_LOD_BIAS);
4184      break;
4185
4186   case TGSI_OPCODE_SAMPLE_D:
4187      exec_sample_d(mach, inst);
4188      break;
4189
4190   case TGSI_OPCODE_SAMPLE_L:
4191      exec_sample(mach, inst, TEX_MODIFIER_EXPLICIT_LOD);
4192      break;
4193
4194   case TGSI_OPCODE_GATHER4:
4195      assert(0);
4196      break;
4197
4198   case TGSI_OPCODE_RESINFO:
4199      assert(0);
4200      break;
4201
4202   case TGSI_OPCODE_SAMPLE_POS:
4203      assert(0);
4204      break;
4205
4206   case TGSI_OPCODE_SAMPLE_INFO:
4207      assert(0);
4208      break;
4209
4210   case TGSI_OPCODE_UARL:
4211      exec_vector_unary(mach, inst, micro_uarl, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_UINT);
4212      break;
4213
4214   case TGSI_OPCODE_UCMP:
4215      exec_vector_trinary(mach, inst, micro_ucmp, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4216      break;
4217
4218   case TGSI_OPCODE_IABS:
4219      exec_vector_unary(mach, inst, micro_iabs, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
4220      break;
4221
4222   case TGSI_OPCODE_ISSG:
4223      exec_vector_unary(mach, inst, micro_isgn, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
4224      break;
4225
4226   default:
4227      assert( 0 );
4228   }
4229}
4230
4231
4232#define DEBUG_EXECUTION 0
4233
4234
4235/**
4236 * Run TGSI interpreter.
4237 * \return bitmask of "alive" quad components
4238 */
4239uint
4240tgsi_exec_machine_run( struct tgsi_exec_machine *mach )
4241{
4242   uint i;
4243   int pc = 0;
4244
4245   mach->CondMask = 0xf;
4246   mach->LoopMask = 0xf;
4247   mach->ContMask = 0xf;
4248   mach->FuncMask = 0xf;
4249   mach->ExecMask = 0xf;
4250
4251   mach->Switch.mask = 0xf;
4252
4253   assert(mach->CondStackTop == 0);
4254   assert(mach->LoopStackTop == 0);
4255   assert(mach->ContStackTop == 0);
4256   assert(mach->SwitchStackTop == 0);
4257   assert(mach->BreakStackTop == 0);
4258   assert(mach->CallStackTop == 0);
4259
4260   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] = 0;
4261   mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] = 0;
4262
4263   if( mach->Processor == TGSI_PROCESSOR_GEOMETRY ) {
4264      mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0] = 0;
4265      mach->Primitives[0] = 0;
4266   }
4267
4268   /* execute declarations (interpolants) */
4269   for (i = 0; i < mach->NumDeclarations; i++) {
4270      exec_declaration( mach, mach->Declarations+i );
4271   }
4272
4273   {
4274#if DEBUG_EXECUTION
4275      struct tgsi_exec_vector temps[TGSI_EXEC_NUM_TEMPS + TGSI_EXEC_NUM_TEMP_EXTRAS];
4276      struct tgsi_exec_vector outputs[PIPE_MAX_ATTRIBS];
4277      uint inst = 1;
4278
4279      memcpy(temps, mach->Temps, sizeof(temps));
4280      memcpy(outputs, mach->Outputs, sizeof(outputs));
4281#endif
4282
4283      /* execute instructions, until pc is set to -1 */
4284      while (pc != -1) {
4285
4286#if DEBUG_EXECUTION
4287         uint i;
4288
4289         tgsi_dump_instruction(&mach->Instructions[pc], inst++);
4290#endif
4291
4292         assert(pc < (int) mach->NumInstructions);
4293         exec_instruction(mach, mach->Instructions + pc, &pc);
4294
4295#if DEBUG_EXECUTION
4296         for (i = 0; i < TGSI_EXEC_NUM_TEMPS + TGSI_EXEC_NUM_TEMP_EXTRAS; i++) {
4297            if (memcmp(&temps[i], &mach->Temps[i], sizeof(temps[i]))) {
4298               uint j;
4299
4300               memcpy(&temps[i], &mach->Temps[i], sizeof(temps[i]));
4301               debug_printf("TEMP[%2u] = ", i);
4302               for (j = 0; j < 4; j++) {
4303                  if (j > 0) {
4304                     debug_printf("           ");
4305                  }
4306                  debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
4307                               temps[i].xyzw[0].f[j], temps[i].xyzw[0].u[j],
4308                               temps[i].xyzw[1].f[j], temps[i].xyzw[1].u[j],
4309                               temps[i].xyzw[2].f[j], temps[i].xyzw[2].u[j],
4310                               temps[i].xyzw[3].f[j], temps[i].xyzw[3].u[j]);
4311               }
4312            }
4313         }
4314         for (i = 0; i < PIPE_MAX_ATTRIBS; i++) {
4315            if (memcmp(&outputs[i], &mach->Outputs[i], sizeof(outputs[i]))) {
4316               uint j;
4317
4318               memcpy(&outputs[i], &mach->Outputs[i], sizeof(outputs[i]));
4319               debug_printf("OUT[%2u] =  ", i);
4320               for (j = 0; j < 4; j++) {
4321                  if (j > 0) {
4322                     debug_printf("           ");
4323                  }
4324                  debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
4325                               outputs[i].xyzw[0].f[j], outputs[i].xyzw[0].u[j],
4326                               outputs[i].xyzw[1].f[j], outputs[i].xyzw[1].u[j],
4327                               outputs[i].xyzw[2].f[j], outputs[i].xyzw[2].u[j],
4328                               outputs[i].xyzw[3].f[j], outputs[i].xyzw[3].u[j]);
4329               }
4330            }
4331         }
4332#endif
4333      }
4334   }
4335
4336#if 0
4337   /* we scale from floats in [0,1] to Zbuffer ints in sp_quad_depth_test.c */
4338   if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) {
4339      /*
4340       * Scale back depth component.
4341       */
4342      for (i = 0; i < 4; i++)
4343         mach->Outputs[0].xyzw[2].f[i] *= ctx->DrawBuffer->_DepthMaxF;
4344   }
4345#endif
4346
4347   /* Strictly speaking, these assertions aren't really needed but they
4348    * can potentially catch some bugs in the control flow code.
4349    */
4350   assert(mach->CondStackTop == 0);
4351   assert(mach->LoopStackTop == 0);
4352   assert(mach->ContStackTop == 0);
4353   assert(mach->SwitchStackTop == 0);
4354   assert(mach->BreakStackTop == 0);
4355   assert(mach->CallStackTop == 0);
4356
4357   return ~mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
4358}
4359