tgsi_exec.c revision b6cbc28533a3fd68dbfe694c0774735233df8758
1/**************************************************************************
2 *
3 * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
4 * All Rights Reserved.
5 * Copyright 2009-2010 VMware, Inc.  All rights Reserved.
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the
9 * "Software"), to deal in the Software without restriction, including
10 * without limitation the rights to use, copy, modify, merge, publish,
11 * distribute, sub license, and/or sell copies of the Software, and to
12 * permit persons to whom the Software is furnished to do so, subject to
13 * the following conditions:
14 *
15 * The above copyright notice and this permission notice (including the
16 * next paragraph) shall be included in all copies or substantial portions
17 * of the Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
22 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
23 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
24 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
25 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 *
27 **************************************************************************/
28
29/**
30 * TGSI interpreter/executor.
31 *
32 * Flow control information:
33 *
34 * Since we operate on 'quads' (4 pixels or 4 vertices in parallel)
35 * flow control statements (IF/ELSE/ENDIF, LOOP/ENDLOOP) require special
36 * care since a condition may be true for some quad components but false
37 * for other components.
38 *
39 * We basically execute all statements (even if they're in the part of
40 * an IF/ELSE clause that's "not taken") and use a special mask to
41 * control writing to destination registers.  This is the ExecMask.
42 * See store_dest().
43 *
 * The ExecMask is computed from several other masks (CondMask, LoopMask,
 * ContMask, Switch.mask and FuncMask; see UPDATE_EXEC_MASK) which are
 * controlled by the flow control instructions (for example IF/ELSE/ENDIF,
 * LOOP/ENDLOOP and CONT).
47 *
48 *
49 * Authors:
50 *   Michal Krol
51 *   Brian Paul
52 */
53
54#include "pipe/p_compiler.h"
55#include "pipe/p_state.h"
56#include "pipe/p_shader_tokens.h"
57#include "tgsi/tgsi_dump.h"
58#include "tgsi/tgsi_parse.h"
59#include "tgsi/tgsi_util.h"
60#include "tgsi_exec.h"
61#include "util/u_memory.h"
62#include "util/u_math.h"
63
64
/*
 * When non-zero, micro_exp2(), micro_lg2() and micro_pow() call the
 * util_fast_*() helpers instead of the C math library functions.
 */
#define FAST_MATH 0

/*
 * Slot of each pixel within the four values of a channel.  micro_ddx() and
 * micro_ddy() subtract pairs of these slots to form their results.
 */
#define TILE_TOP_LEFT     0
#define TILE_TOP_RIGHT    1
#define TILE_BOTTOM_LEFT  2
#define TILE_BOTTOM_RIGHT 3
71
72static void
73micro_abs(union tgsi_exec_channel *dst,
74          const union tgsi_exec_channel *src)
75{
76   dst->f[0] = fabsf(src->f[0]);
77   dst->f[1] = fabsf(src->f[1]);
78   dst->f[2] = fabsf(src->f[2]);
79   dst->f[3] = fabsf(src->f[3]);
80}
81
82static void
83micro_arl(union tgsi_exec_channel *dst,
84          const union tgsi_exec_channel *src)
85{
86   dst->i[0] = (int)floorf(src->f[0]);
87   dst->i[1] = (int)floorf(src->f[1]);
88   dst->i[2] = (int)floorf(src->f[2]);
89   dst->i[3] = (int)floorf(src->f[3]);
90}
91
92static void
93micro_arr(union tgsi_exec_channel *dst,
94          const union tgsi_exec_channel *src)
95{
96   dst->i[0] = (int)floorf(src->f[0] + 0.5f);
97   dst->i[1] = (int)floorf(src->f[1] + 0.5f);
98   dst->i[2] = (int)floorf(src->f[2] + 0.5f);
99   dst->i[3] = (int)floorf(src->f[3] + 0.5f);
100}
101
102static void
103micro_ceil(union tgsi_exec_channel *dst,
104           const union tgsi_exec_channel *src)
105{
106   dst->f[0] = ceilf(src->f[0]);
107   dst->f[1] = ceilf(src->f[1]);
108   dst->f[2] = ceilf(src->f[2]);
109   dst->f[3] = ceilf(src->f[3]);
110}
111
112static void
113micro_clamp(union tgsi_exec_channel *dst,
114            const union tgsi_exec_channel *src0,
115            const union tgsi_exec_channel *src1,
116            const union tgsi_exec_channel *src2)
117{
118   dst->f[0] = src0->f[0] < src1->f[0] ? src1->f[0] : src0->f[0] > src2->f[0] ? src2->f[0] : src0->f[0];
119   dst->f[1] = src0->f[1] < src1->f[1] ? src1->f[1] : src0->f[1] > src2->f[1] ? src2->f[1] : src0->f[1];
120   dst->f[2] = src0->f[2] < src1->f[2] ? src1->f[2] : src0->f[2] > src2->f[2] ? src2->f[2] : src0->f[2];
121   dst->f[3] = src0->f[3] < src1->f[3] ? src1->f[3] : src0->f[3] > src2->f[3] ? src2->f[3] : src0->f[3];
122}
123
124static void
125micro_cmp(union tgsi_exec_channel *dst,
126          const union tgsi_exec_channel *src0,
127          const union tgsi_exec_channel *src1,
128          const union tgsi_exec_channel *src2)
129{
130   dst->f[0] = src0->f[0] < 0.0f ? src1->f[0] : src2->f[0];
131   dst->f[1] = src0->f[1] < 0.0f ? src1->f[1] : src2->f[1];
132   dst->f[2] = src0->f[2] < 0.0f ? src1->f[2] : src2->f[2];
133   dst->f[3] = src0->f[3] < 0.0f ? src1->f[3] : src2->f[3];
134}
135
136static void
137micro_cnd(union tgsi_exec_channel *dst,
138          const union tgsi_exec_channel *src0,
139          const union tgsi_exec_channel *src1,
140          const union tgsi_exec_channel *src2)
141{
142   dst->f[0] = src2->f[0] > 0.5f ? src0->f[0] : src1->f[0];
143   dst->f[1] = src2->f[1] > 0.5f ? src0->f[1] : src1->f[1];
144   dst->f[2] = src2->f[2] > 0.5f ? src0->f[2] : src1->f[2];
145   dst->f[3] = src2->f[3] > 0.5f ? src0->f[3] : src1->f[3];
146}
147
148static void
149micro_cos(union tgsi_exec_channel *dst,
150          const union tgsi_exec_channel *src)
151{
152   dst->f[0] = cosf(src->f[0]);
153   dst->f[1] = cosf(src->f[1]);
154   dst->f[2] = cosf(src->f[2]);
155   dst->f[3] = cosf(src->f[3]);
156}
157
158static void
159micro_ddx(union tgsi_exec_channel *dst,
160          const union tgsi_exec_channel *src)
161{
162   dst->f[0] =
163   dst->f[1] =
164   dst->f[2] =
165   dst->f[3] = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_BOTTOM_LEFT];
166}
167
168static void
169micro_ddy(union tgsi_exec_channel *dst,
170          const union tgsi_exec_channel *src)
171{
172   dst->f[0] =
173   dst->f[1] =
174   dst->f[2] =
175   dst->f[3] = src->f[TILE_BOTTOM_LEFT] - src->f[TILE_TOP_LEFT];
176}
177
178static void
179micro_exp2(union tgsi_exec_channel *dst,
180           const union tgsi_exec_channel *src)
181{
182#if FAST_MATH
183   dst->f[0] = util_fast_exp2(src->f[0]);
184   dst->f[1] = util_fast_exp2(src->f[1]);
185   dst->f[2] = util_fast_exp2(src->f[2]);
186   dst->f[3] = util_fast_exp2(src->f[3]);
187#else
188#if DEBUG
189   /* Inf is okay for this instruction, so clamp it to silence assertions. */
190   uint i;
191   union tgsi_exec_channel clamped;
192
193   for (i = 0; i < 4; i++) {
194      if (src->f[i] > 127.99999f) {
195         clamped.f[i] = 127.99999f;
196      } else if (src->f[i] < -126.99999f) {
197         clamped.f[i] = -126.99999f;
198      } else {
199         clamped.f[i] = src->f[i];
200      }
201   }
202   src = &clamped;
203#endif /* DEBUG */
204
205   dst->f[0] = powf(2.0f, src->f[0]);
206   dst->f[1] = powf(2.0f, src->f[1]);
207   dst->f[2] = powf(2.0f, src->f[2]);
208   dst->f[3] = powf(2.0f, src->f[3]);
209#endif /* FAST_MATH */
210}
211
212static void
213micro_flr(union tgsi_exec_channel *dst,
214          const union tgsi_exec_channel *src)
215{
216   dst->f[0] = floorf(src->f[0]);
217   dst->f[1] = floorf(src->f[1]);
218   dst->f[2] = floorf(src->f[2]);
219   dst->f[3] = floorf(src->f[3]);
220}
221
222static void
223micro_frc(union tgsi_exec_channel *dst,
224          const union tgsi_exec_channel *src)
225{
226   dst->f[0] = src->f[0] - floorf(src->f[0]);
227   dst->f[1] = src->f[1] - floorf(src->f[1]);
228   dst->f[2] = src->f[2] - floorf(src->f[2]);
229   dst->f[3] = src->f[3] - floorf(src->f[3]);
230}
231
232static void
233micro_iabs(union tgsi_exec_channel *dst,
234           const union tgsi_exec_channel *src)
235{
236   dst->i[0] = src->i[0] >= 0 ? src->i[0] : -src->i[0];
237   dst->i[1] = src->i[1] >= 0 ? src->i[1] : -src->i[1];
238   dst->i[2] = src->i[2] >= 0 ? src->i[2] : -src->i[2];
239   dst->i[3] = src->i[3] >= 0 ? src->i[3] : -src->i[3];
240}
241
242static void
243micro_ineg(union tgsi_exec_channel *dst,
244           const union tgsi_exec_channel *src)
245{
246   dst->i[0] = -src->i[0];
247   dst->i[1] = -src->i[1];
248   dst->i[2] = -src->i[2];
249   dst->i[3] = -src->i[3];
250}
251
252static void
253micro_lg2(union tgsi_exec_channel *dst,
254          const union tgsi_exec_channel *src)
255{
256#if FAST_MATH
257   dst->f[0] = util_fast_log2(src->f[0]);
258   dst->f[1] = util_fast_log2(src->f[1]);
259   dst->f[2] = util_fast_log2(src->f[2]);
260   dst->f[3] = util_fast_log2(src->f[3]);
261#else
262   dst->f[0] = logf(src->f[0]) * 1.442695f;
263   dst->f[1] = logf(src->f[1]) * 1.442695f;
264   dst->f[2] = logf(src->f[2]) * 1.442695f;
265   dst->f[3] = logf(src->f[3]) * 1.442695f;
266#endif
267}
268
269static void
270micro_lrp(union tgsi_exec_channel *dst,
271          const union tgsi_exec_channel *src0,
272          const union tgsi_exec_channel *src1,
273          const union tgsi_exec_channel *src2)
274{
275   dst->f[0] = src0->f[0] * (src1->f[0] - src2->f[0]) + src2->f[0];
276   dst->f[1] = src0->f[1] * (src1->f[1] - src2->f[1]) + src2->f[1];
277   dst->f[2] = src0->f[2] * (src1->f[2] - src2->f[2]) + src2->f[2];
278   dst->f[3] = src0->f[3] * (src1->f[3] - src2->f[3]) + src2->f[3];
279}
280
281static void
282micro_mad(union tgsi_exec_channel *dst,
283          const union tgsi_exec_channel *src0,
284          const union tgsi_exec_channel *src1,
285          const union tgsi_exec_channel *src2)
286{
287   dst->f[0] = src0->f[0] * src1->f[0] + src2->f[0];
288   dst->f[1] = src0->f[1] * src1->f[1] + src2->f[1];
289   dst->f[2] = src0->f[2] * src1->f[2] + src2->f[2];
290   dst->f[3] = src0->f[3] * src1->f[3] + src2->f[3];
291}
292
293static void
294micro_mov(union tgsi_exec_channel *dst,
295          const union tgsi_exec_channel *src)
296{
297   dst->u[0] = src->u[0];
298   dst->u[1] = src->u[1];
299   dst->u[2] = src->u[2];
300   dst->u[3] = src->u[3];
301}
302
303static void
304micro_rcp(union tgsi_exec_channel *dst,
305          const union tgsi_exec_channel *src)
306{
307#if 0 /* for debugging */
308   assert(src->f[0] != 0.0f);
309   assert(src->f[1] != 0.0f);
310   assert(src->f[2] != 0.0f);
311   assert(src->f[3] != 0.0f);
312#endif
313   dst->f[0] = 1.0f / src->f[0];
314   dst->f[1] = 1.0f / src->f[1];
315   dst->f[2] = 1.0f / src->f[2];
316   dst->f[3] = 1.0f / src->f[3];
317}
318
319static void
320micro_rnd(union tgsi_exec_channel *dst,
321          const union tgsi_exec_channel *src)
322{
323   dst->f[0] = floorf(src->f[0] + 0.5f);
324   dst->f[1] = floorf(src->f[1] + 0.5f);
325   dst->f[2] = floorf(src->f[2] + 0.5f);
326   dst->f[3] = floorf(src->f[3] + 0.5f);
327}
328
329static void
330micro_rsq(union tgsi_exec_channel *dst,
331          const union tgsi_exec_channel *src)
332{
333#if 0 /* for debugging */
334   assert(src->f[0] != 0.0f);
335   assert(src->f[1] != 0.0f);
336   assert(src->f[2] != 0.0f);
337   assert(src->f[3] != 0.0f);
338#endif
339   dst->f[0] = 1.0f / sqrtf(fabsf(src->f[0]));
340   dst->f[1] = 1.0f / sqrtf(fabsf(src->f[1]));
341   dst->f[2] = 1.0f / sqrtf(fabsf(src->f[2]));
342   dst->f[3] = 1.0f / sqrtf(fabsf(src->f[3]));
343}
344
345static void
346micro_seq(union tgsi_exec_channel *dst,
347          const union tgsi_exec_channel *src0,
348          const union tgsi_exec_channel *src1)
349{
350   dst->f[0] = src0->f[0] == src1->f[0] ? 1.0f : 0.0f;
351   dst->f[1] = src0->f[1] == src1->f[1] ? 1.0f : 0.0f;
352   dst->f[2] = src0->f[2] == src1->f[2] ? 1.0f : 0.0f;
353   dst->f[3] = src0->f[3] == src1->f[3] ? 1.0f : 0.0f;
354}
355
356static void
357micro_sge(union tgsi_exec_channel *dst,
358          const union tgsi_exec_channel *src0,
359          const union tgsi_exec_channel *src1)
360{
361   dst->f[0] = src0->f[0] >= src1->f[0] ? 1.0f : 0.0f;
362   dst->f[1] = src0->f[1] >= src1->f[1] ? 1.0f : 0.0f;
363   dst->f[2] = src0->f[2] >= src1->f[2] ? 1.0f : 0.0f;
364   dst->f[3] = src0->f[3] >= src1->f[3] ? 1.0f : 0.0f;
365}
366
367static void
368micro_sgn(union tgsi_exec_channel *dst,
369          const union tgsi_exec_channel *src)
370{
371   dst->f[0] = src->f[0] < 0.0f ? -1.0f : src->f[0] > 0.0f ? 1.0f : 0.0f;
372   dst->f[1] = src->f[1] < 0.0f ? -1.0f : src->f[1] > 0.0f ? 1.0f : 0.0f;
373   dst->f[2] = src->f[2] < 0.0f ? -1.0f : src->f[2] > 0.0f ? 1.0f : 0.0f;
374   dst->f[3] = src->f[3] < 0.0f ? -1.0f : src->f[3] > 0.0f ? 1.0f : 0.0f;
375}
376
377static void
378micro_isgn(union tgsi_exec_channel *dst,
379          const union tgsi_exec_channel *src)
380{
381   dst->i[0] = src->i[0] < 0 ? -1 : src->i[0] > 0 ? 1 : 0;
382   dst->i[1] = src->i[1] < 0 ? -1 : src->i[1] > 0 ? 1 : 0;
383   dst->i[2] = src->i[2] < 0 ? -1 : src->i[2] > 0 ? 1 : 0;
384   dst->i[3] = src->i[3] < 0 ? -1 : src->i[3] > 0 ? 1 : 0;
385}
386
387static void
388micro_sgt(union tgsi_exec_channel *dst,
389          const union tgsi_exec_channel *src0,
390          const union tgsi_exec_channel *src1)
391{
392   dst->f[0] = src0->f[0] > src1->f[0] ? 1.0f : 0.0f;
393   dst->f[1] = src0->f[1] > src1->f[1] ? 1.0f : 0.0f;
394   dst->f[2] = src0->f[2] > src1->f[2] ? 1.0f : 0.0f;
395   dst->f[3] = src0->f[3] > src1->f[3] ? 1.0f : 0.0f;
396}
397
398static void
399micro_sin(union tgsi_exec_channel *dst,
400          const union tgsi_exec_channel *src)
401{
402   dst->f[0] = sinf(src->f[0]);
403   dst->f[1] = sinf(src->f[1]);
404   dst->f[2] = sinf(src->f[2]);
405   dst->f[3] = sinf(src->f[3]);
406}
407
408static void
409micro_sle(union tgsi_exec_channel *dst,
410          const union tgsi_exec_channel *src0,
411          const union tgsi_exec_channel *src1)
412{
413   dst->f[0] = src0->f[0] <= src1->f[0] ? 1.0f : 0.0f;
414   dst->f[1] = src0->f[1] <= src1->f[1] ? 1.0f : 0.0f;
415   dst->f[2] = src0->f[2] <= src1->f[2] ? 1.0f : 0.0f;
416   dst->f[3] = src0->f[3] <= src1->f[3] ? 1.0f : 0.0f;
417}
418
419static void
420micro_slt(union tgsi_exec_channel *dst,
421          const union tgsi_exec_channel *src0,
422          const union tgsi_exec_channel *src1)
423{
424   dst->f[0] = src0->f[0] < src1->f[0] ? 1.0f : 0.0f;
425   dst->f[1] = src0->f[1] < src1->f[1] ? 1.0f : 0.0f;
426   dst->f[2] = src0->f[2] < src1->f[2] ? 1.0f : 0.0f;
427   dst->f[3] = src0->f[3] < src1->f[3] ? 1.0f : 0.0f;
428}
429
430static void
431micro_sne(union tgsi_exec_channel *dst,
432          const union tgsi_exec_channel *src0,
433          const union tgsi_exec_channel *src1)
434{
435   dst->f[0] = src0->f[0] != src1->f[0] ? 1.0f : 0.0f;
436   dst->f[1] = src0->f[1] != src1->f[1] ? 1.0f : 0.0f;
437   dst->f[2] = src0->f[2] != src1->f[2] ? 1.0f : 0.0f;
438   dst->f[3] = src0->f[3] != src1->f[3] ? 1.0f : 0.0f;
439}
440
441static void
442micro_sfl(union tgsi_exec_channel *dst)
443{
444   dst->f[0] = 0.0f;
445   dst->f[1] = 0.0f;
446   dst->f[2] = 0.0f;
447   dst->f[3] = 0.0f;
448}
449
450static void
451micro_str(union tgsi_exec_channel *dst)
452{
453   dst->f[0] = 1.0f;
454   dst->f[1] = 1.0f;
455   dst->f[2] = 1.0f;
456   dst->f[3] = 1.0f;
457}
458
459static void
460micro_trunc(union tgsi_exec_channel *dst,
461            const union tgsi_exec_channel *src)
462{
463   dst->f[0] = (float)(int)src->f[0];
464   dst->f[1] = (float)(int)src->f[1];
465   dst->f[2] = (float)(int)src->f[2];
466   dst->f[3] = (float)(int)src->f[3];
467}
468
469
/* Indices of the X, Y, Z and W channels of a register. */
#define CHAN_X  0
#define CHAN_Y  1
#define CHAN_Z  2
#define CHAN_W  3

/*
 * Data type selector: float, signed integer or unsigned integer.
 * NOTE(review): presumably tells helpers how to interpret a channel's
 * bits; no use is visible in this part of the file -- confirm.
 */
enum tgsi_exec_datatype {
   TGSI_EXEC_DATA_FLOAT,
   TGSI_EXEC_DATA_INT,
   TGSI_EXEC_DATA_UINT
};
480
/*
 * Shorthand locations of various utility registers (_I = Index, _C = Channel).
 * Each name is a plain alias for the TGSI_EXEC_TEMP_* macro of the same
 * suffix; the _I/_C pair together select one channel of one temporary.
 */
#define TEMP_KILMASK_I     TGSI_EXEC_TEMP_KILMASK_I
#define TEMP_KILMASK_C     TGSI_EXEC_TEMP_KILMASK_C
#define TEMP_OUTPUT_I      TGSI_EXEC_TEMP_OUTPUT_I
#define TEMP_OUTPUT_C      TGSI_EXEC_TEMP_OUTPUT_C
#define TEMP_PRIMITIVE_I   TGSI_EXEC_TEMP_PRIMITIVE_I
#define TEMP_PRIMITIVE_C   TGSI_EXEC_TEMP_PRIMITIVE_C
490
491
/**
 * Recompute MACH->ExecMask as the bitwise AND of the CondMask, LoopMask,
 * ContMask, Switch.mask and FuncMask fields of MACH.
 *
 * MACH is a pointer expression.  It is parenthesized so that arguments such
 * as "&machine" expand correctly, but it is evaluated several times and
 * therefore must not have side effects.
 */
#define UPDATE_EXEC_MASK(MACH) \
      (MACH)->ExecMask = (MACH)->CondMask & (MACH)->LoopMask & (MACH)->ContMask & (MACH)->Switch.mask & (MACH)->FuncMask
495
496
497static const union tgsi_exec_channel ZeroVec =
498   { { 0.0, 0.0, 0.0, 0.0 } };
499
500static const union tgsi_exec_channel OneVec = {
501   {1.0f, 1.0f, 1.0f, 1.0f}
502};
503
504static const union tgsi_exec_channel P128Vec = {
505   {128.0f, 128.0f, 128.0f, 128.0f}
506};
507
508static const union tgsi_exec_channel M128Vec = {
509   {-128.0f, -128.0f, -128.0f, -128.0f}
510};
511
512
513/**
514 * Assert that none of the float values in 'chan' are infinite or NaN.
515 * NaN and Inf may occur normally during program execution and should
516 * not lead to crashes, etc.  But when debugging, it's helpful to catch
517 * them.
518 */
519static INLINE void
520check_inf_or_nan(const union tgsi_exec_channel *chan)
521{
522   assert(!util_is_inf_or_nan((chan)->f[0]));
523   assert(!util_is_inf_or_nan((chan)->f[1]));
524   assert(!util_is_inf_or_nan((chan)->f[2]));
525   assert(!util_is_inf_or_nan((chan)->f[3]));
526}
527
528
#ifdef DEBUG
/** Debug helper: print the four float slots of 'chan', labelled with 'msg'. */
static void
print_chan(const char *msg, const union tgsi_exec_channel *chan)
{
   const float *slots = chan->f;

   debug_printf("%s = {%f, %f, %f, %f}\n",
                msg, slots[0], slots[1], slots[2], slots[3]);
}
#endif
537
538
#ifdef DEBUG
/**
 * Debug helper: print all four channels (X, Y, Z, W) of temporary
 * register 'index', four float slots per channel.
 */
static void
print_temp(const struct tgsi_exec_machine *mach, uint index)
{
   const struct tgsi_exec_vector *vec = &mach->Temps[index];
   int chan;

   debug_printf("Temp[%u] =\n", index);

   for (chan = 0; chan < 4; chan++) {
      const float *slots = vec->xyzw[chan].f;

      debug_printf("  %c: { %f, %f, %f, %f }\n",
                   "XYZW"[chan], slots[0], slots[1], slots[2], slots[3]);
   }
}
#endif
556
557
558void
559tgsi_exec_set_constant_buffers(struct tgsi_exec_machine *mach,
560                               unsigned num_bufs,
561                               const void **bufs,
562                               const unsigned *buf_sizes)
563{
564   unsigned i;
565
566   for (i = 0; i < num_bufs; i++) {
567      mach->Consts[i] = bufs[i];
568      mach->ConstsSize[i] = buf_sizes[i];
569   }
570}
571
572
573/**
574 * Check if there's a potential src/dst register data dependency when
575 * using SOA execution.
576 * Example:
577 *   MOV T, T.yxwz;
578 * This would expand into:
579 *   MOV t0, t1;
580 *   MOV t1, t0;
581 *   MOV t2, t3;
582 *   MOV t3, t2;
583 * The second instruction will have the wrong value for t0 if executed as-is.
584 */
585boolean
586tgsi_check_soa_dependencies(const struct tgsi_full_instruction *inst)
587{
588   uint i, chan;
589
590   uint writemask = inst->Dst[0].Register.WriteMask;
591   if (writemask == TGSI_WRITEMASK_X ||
592       writemask == TGSI_WRITEMASK_Y ||
593       writemask == TGSI_WRITEMASK_Z ||
594       writemask == TGSI_WRITEMASK_W ||
595       writemask == TGSI_WRITEMASK_NONE) {
596      /* no chance of data dependency */
597      return FALSE;
598   }
599
600   /* loop over src regs */
601   for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
602      if ((inst->Src[i].Register.File ==
603           inst->Dst[0].Register.File) &&
604          ((inst->Src[i].Register.Index ==
605            inst->Dst[0].Register.Index) ||
606           inst->Src[i].Register.Indirect ||
607           inst->Dst[0].Register.Indirect)) {
608         /* loop over dest channels */
609         uint channelsWritten = 0x0;
610         for (chan = 0; chan < NUM_CHANNELS; chan++) {
611            if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
612               /* check if we're reading a channel that's been written */
613               uint swizzle = tgsi_util_get_full_src_register_swizzle(&inst->Src[i], chan);
614               if (channelsWritten & (1 << swizzle)) {
615                  return TRUE;
616               }
617
618               channelsWritten |= (1 << chan);
619            }
620         }
621      }
622   }
623   return FALSE;
624}
625
626
627/**
628 * Initialize machine state by expanding tokens to full instructions,
629 * allocating temporary storage, setting up constants, etc.
630 * After this, we can call tgsi_exec_machine_run() many times.
631 */
632void
633tgsi_exec_machine_bind_shader(
634   struct tgsi_exec_machine *mach,
635   const struct tgsi_token *tokens,
636   uint numSamplers,
637   struct tgsi_sampler **samplers)
638{
639   uint k;
640   struct tgsi_parse_context parse;
641   struct tgsi_full_instruction *instructions;
642   struct tgsi_full_declaration *declarations;
643   uint maxInstructions = 10, numInstructions = 0;
644   uint maxDeclarations = 10, numDeclarations = 0;
645
646#if 0
647   tgsi_dump(tokens, 0);
648#endif
649
650   util_init_math();
651
652   if (numSamplers) {
653      assert(samplers);
654   }
655
656   mach->Tokens = tokens;
657   mach->Samplers = samplers;
658
659   if (!tokens) {
660      /* unbind and free all */
661      if (mach->Declarations) {
662         FREE( mach->Declarations );
663      }
664      mach->Declarations = NULL;
665      mach->NumDeclarations = 0;
666
667      if (mach->Instructions) {
668         FREE( mach->Instructions );
669      }
670      mach->Instructions = NULL;
671      mach->NumInstructions = 0;
672
673      return;
674   }
675
676   k = tgsi_parse_init (&parse, mach->Tokens);
677   if (k != TGSI_PARSE_OK) {
678      debug_printf( "Problem parsing!\n" );
679      return;
680   }
681
682   mach->Processor = parse.FullHeader.Processor.Processor;
683   mach->ImmLimit = 0;
684
685   if (mach->Processor == TGSI_PROCESSOR_GEOMETRY &&
686       !mach->UsedGeometryShader) {
687      struct tgsi_exec_vector *inputs;
688      struct tgsi_exec_vector *outputs;
689
690      inputs = align_malloc(sizeof(struct tgsi_exec_vector) *
691                            TGSI_MAX_PRIM_VERTICES * PIPE_MAX_ATTRIBS,
692                            16);
693
694      if (!inputs)
695         return;
696
697      outputs = align_malloc(sizeof(struct tgsi_exec_vector) *
698                             TGSI_MAX_TOTAL_VERTICES, 16);
699
700      if (!outputs) {
701         align_free(inputs);
702         return;
703      }
704
705      align_free(mach->Inputs);
706      align_free(mach->Outputs);
707
708      mach->Inputs = inputs;
709      mach->Outputs = outputs;
710      mach->UsedGeometryShader = TRUE;
711   }
712
713   declarations = (struct tgsi_full_declaration *)
714      MALLOC( maxDeclarations * sizeof(struct tgsi_full_declaration) );
715
716   if (!declarations) {
717      return;
718   }
719
720   instructions = (struct tgsi_full_instruction *)
721      MALLOC( maxInstructions * sizeof(struct tgsi_full_instruction) );
722
723   if (!instructions) {
724      FREE( declarations );
725      return;
726   }
727
728   while( !tgsi_parse_end_of_tokens( &parse ) ) {
729      uint i;
730
731      tgsi_parse_token( &parse );
732      switch( parse.FullToken.Token.Type ) {
733      case TGSI_TOKEN_TYPE_DECLARATION:
734         /* save expanded declaration */
735         if (numDeclarations == maxDeclarations) {
736            declarations = REALLOC(declarations,
737                                   maxDeclarations
738                                   * sizeof(struct tgsi_full_declaration),
739                                   (maxDeclarations + 10)
740                                   * sizeof(struct tgsi_full_declaration));
741            maxDeclarations += 10;
742         }
743         if (parse.FullToken.FullDeclaration.Declaration.File == TGSI_FILE_OUTPUT) {
744            unsigned reg;
745            for (reg = parse.FullToken.FullDeclaration.Range.First;
746                 reg <= parse.FullToken.FullDeclaration.Range.Last;
747                 ++reg) {
748               ++mach->NumOutputs;
749            }
750         }
751         if (parse.FullToken.FullDeclaration.Declaration.File ==
752             TGSI_FILE_IMMEDIATE_ARRAY) {
753            unsigned reg;
754            struct tgsi_full_declaration *decl =
755               &parse.FullToken.FullDeclaration;
756            debug_assert(decl->Range.Last < TGSI_EXEC_NUM_IMMEDIATES);
757            for (reg = decl->Range.First; reg <= decl->Range.Last; ++reg) {
758               for( i = 0; i < 4; i++ ) {
759                  int idx = reg * 4 + i;
760                  mach->ImmArray[reg][i] = decl->ImmediateData.u[idx].Float;
761               }
762            }
763         }
764         memcpy(declarations + numDeclarations,
765                &parse.FullToken.FullDeclaration,
766                sizeof(declarations[0]));
767         numDeclarations++;
768         break;
769
770      case TGSI_TOKEN_TYPE_IMMEDIATE:
771         {
772            uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
773            assert( size <= 4 );
774            assert( mach->ImmLimit + 1 <= TGSI_EXEC_NUM_IMMEDIATES );
775
776            for( i = 0; i < size; i++ ) {
777               mach->Imms[mach->ImmLimit][i] =
778		  parse.FullToken.FullImmediate.u[i].Float;
779            }
780            mach->ImmLimit += 1;
781         }
782         break;
783
784      case TGSI_TOKEN_TYPE_INSTRUCTION:
785
786         /* save expanded instruction */
787         if (numInstructions == maxInstructions) {
788            instructions = REALLOC(instructions,
789                                   maxInstructions
790                                   * sizeof(struct tgsi_full_instruction),
791                                   (maxInstructions + 10)
792                                   * sizeof(struct tgsi_full_instruction));
793            maxInstructions += 10;
794         }
795
796         memcpy(instructions + numInstructions,
797                &parse.FullToken.FullInstruction,
798                sizeof(instructions[0]));
799
800         numInstructions++;
801         break;
802
803      case TGSI_TOKEN_TYPE_PROPERTY:
804         break;
805
806      default:
807         assert( 0 );
808      }
809   }
810   tgsi_parse_free (&parse);
811
812   if (mach->Declarations) {
813      FREE( mach->Declarations );
814   }
815   mach->Declarations = declarations;
816   mach->NumDeclarations = numDeclarations;
817
818   if (mach->Instructions) {
819      FREE( mach->Instructions );
820   }
821   mach->Instructions = instructions;
822   mach->NumInstructions = numInstructions;
823}
824
825
826struct tgsi_exec_machine *
827tgsi_exec_machine_create( void )
828{
829   struct tgsi_exec_machine *mach;
830   uint i;
831
832   mach = align_malloc( sizeof *mach, 16 );
833   if (!mach)
834      goto fail;
835
836   memset(mach, 0, sizeof(*mach));
837
838   mach->Addrs = &mach->Temps[TGSI_EXEC_TEMP_ADDR];
839   mach->MaxGeometryShaderOutputs = TGSI_MAX_TOTAL_VERTICES;
840   mach->Predicates = &mach->Temps[TGSI_EXEC_TEMP_P0];
841
842   mach->Inputs = align_malloc(sizeof(struct tgsi_exec_vector) * PIPE_MAX_ATTRIBS, 16);
843   mach->Outputs = align_malloc(sizeof(struct tgsi_exec_vector) * PIPE_MAX_ATTRIBS, 16);
844   if (!mach->Inputs || !mach->Outputs)
845      goto fail;
846
847   /* Setup constants needed by the SSE2 executor. */
848   for( i = 0; i < 4; i++ ) {
849      mach->Temps[TGSI_EXEC_TEMP_00000000_I].xyzw[TGSI_EXEC_TEMP_00000000_C].u[i] = 0x00000000;
850      mach->Temps[TGSI_EXEC_TEMP_7FFFFFFF_I].xyzw[TGSI_EXEC_TEMP_7FFFFFFF_C].u[i] = 0x7FFFFFFF;
851      mach->Temps[TGSI_EXEC_TEMP_80000000_I].xyzw[TGSI_EXEC_TEMP_80000000_C].u[i] = 0x80000000;
852      mach->Temps[TGSI_EXEC_TEMP_FFFFFFFF_I].xyzw[TGSI_EXEC_TEMP_FFFFFFFF_C].u[i] = 0xFFFFFFFF;    /* not used */
853      mach->Temps[TGSI_EXEC_TEMP_ONE_I].xyzw[TGSI_EXEC_TEMP_ONE_C].f[i] = 1.0f;
854      mach->Temps[TGSI_EXEC_TEMP_TWO_I].xyzw[TGSI_EXEC_TEMP_TWO_C].f[i] = 2.0f;    /* not used */
855      mach->Temps[TGSI_EXEC_TEMP_128_I].xyzw[TGSI_EXEC_TEMP_128_C].f[i] = 128.0f;
856      mach->Temps[TGSI_EXEC_TEMP_MINUS_128_I].xyzw[TGSI_EXEC_TEMP_MINUS_128_C].f[i] = -128.0f;
857      mach->Temps[TGSI_EXEC_TEMP_THREE_I].xyzw[TGSI_EXEC_TEMP_THREE_C].f[i] = 3.0f;
858      mach->Temps[TGSI_EXEC_TEMP_HALF_I].xyzw[TGSI_EXEC_TEMP_HALF_C].f[i] = 0.5f;
859   }
860
861#ifdef DEBUG
862   /* silence warnings */
863   (void) print_chan;
864   (void) print_temp;
865#endif
866
867   return mach;
868
869fail:
870   if (mach) {
871      align_free(mach->Inputs);
872      align_free(mach->Outputs);
873      align_free(mach);
874   }
875   return NULL;
876}
877
878
879void
880tgsi_exec_machine_destroy(struct tgsi_exec_machine *mach)
881{
882   if (mach) {
883      if (mach->Instructions)
884         FREE(mach->Instructions);
885      if (mach->Declarations)
886         FREE(mach->Declarations);
887
888      align_free(mach->Inputs);
889      align_free(mach->Outputs);
890
891      align_free(mach);
892   }
893}
894
895static void
896micro_add(union tgsi_exec_channel *dst,
897          const union tgsi_exec_channel *src0,
898          const union tgsi_exec_channel *src1)
899{
900   dst->f[0] = src0->f[0] + src1->f[0];
901   dst->f[1] = src0->f[1] + src1->f[1];
902   dst->f[2] = src0->f[2] + src1->f[2];
903   dst->f[3] = src0->f[3] + src1->f[3];
904}
905
906static void
907micro_div(
908   union tgsi_exec_channel *dst,
909   const union tgsi_exec_channel *src0,
910   const union tgsi_exec_channel *src1 )
911{
912   if (src1->f[0] != 0) {
913      dst->f[0] = src0->f[0] / src1->f[0];
914   }
915   if (src1->f[1] != 0) {
916      dst->f[1] = src0->f[1] / src1->f[1];
917   }
918   if (src1->f[2] != 0) {
919      dst->f[2] = src0->f[2] / src1->f[2];
920   }
921   if (src1->f[3] != 0) {
922      dst->f[3] = src0->f[3] / src1->f[3];
923   }
924}
925
926static void
927micro_rcc(union tgsi_exec_channel *dst,
928          const union tgsi_exec_channel *src)
929{
930   uint i;
931
932   for (i = 0; i < 4; i++) {
933      float recip = 1.0f / src->f[i];
934
935      if (recip > 0.0f) {
936         if (recip > 1.884467e+019f) {
937            dst->f[i] = 1.884467e+019f;
938         }
939         else if (recip < 5.42101e-020f) {
940            dst->f[i] = 5.42101e-020f;
941         }
942         else {
943            dst->f[i] = recip;
944         }
945      }
946      else {
947         if (recip < -1.884467e+019f) {
948            dst->f[i] = -1.884467e+019f;
949         }
950         else if (recip > -5.42101e-020f) {
951            dst->f[i] = -5.42101e-020f;
952         }
953         else {
954            dst->f[i] = recip;
955         }
956      }
957   }
958}
959
960static void
961micro_lt(
962   union tgsi_exec_channel *dst,
963   const union tgsi_exec_channel *src0,
964   const union tgsi_exec_channel *src1,
965   const union tgsi_exec_channel *src2,
966   const union tgsi_exec_channel *src3 )
967{
968   dst->f[0] = src0->f[0] < src1->f[0] ? src2->f[0] : src3->f[0];
969   dst->f[1] = src0->f[1] < src1->f[1] ? src2->f[1] : src3->f[1];
970   dst->f[2] = src0->f[2] < src1->f[2] ? src2->f[2] : src3->f[2];
971   dst->f[3] = src0->f[3] < src1->f[3] ? src2->f[3] : src3->f[3];
972}
973
974static void
975micro_max(union tgsi_exec_channel *dst,
976          const union tgsi_exec_channel *src0,
977          const union tgsi_exec_channel *src1)
978{
979   dst->f[0] = src0->f[0] > src1->f[0] ? src0->f[0] : src1->f[0];
980   dst->f[1] = src0->f[1] > src1->f[1] ? src0->f[1] : src1->f[1];
981   dst->f[2] = src0->f[2] > src1->f[2] ? src0->f[2] : src1->f[2];
982   dst->f[3] = src0->f[3] > src1->f[3] ? src0->f[3] : src1->f[3];
983}
984
985static void
986micro_min(union tgsi_exec_channel *dst,
987          const union tgsi_exec_channel *src0,
988          const union tgsi_exec_channel *src1)
989{
990   dst->f[0] = src0->f[0] < src1->f[0] ? src0->f[0] : src1->f[0];
991   dst->f[1] = src0->f[1] < src1->f[1] ? src0->f[1] : src1->f[1];
992   dst->f[2] = src0->f[2] < src1->f[2] ? src0->f[2] : src1->f[2];
993   dst->f[3] = src0->f[3] < src1->f[3] ? src0->f[3] : src1->f[3];
994}
995
996static void
997micro_mul(union tgsi_exec_channel *dst,
998          const union tgsi_exec_channel *src0,
999          const union tgsi_exec_channel *src1)
1000{
1001   dst->f[0] = src0->f[0] * src1->f[0];
1002   dst->f[1] = src0->f[1] * src1->f[1];
1003   dst->f[2] = src0->f[2] * src1->f[2];
1004   dst->f[3] = src0->f[3] * src1->f[3];
1005}
1006
1007static void
1008micro_neg(
1009   union tgsi_exec_channel *dst,
1010   const union tgsi_exec_channel *src )
1011{
1012   dst->f[0] = -src->f[0];
1013   dst->f[1] = -src->f[1];
1014   dst->f[2] = -src->f[2];
1015   dst->f[3] = -src->f[3];
1016}
1017
1018static void
1019micro_pow(
1020   union tgsi_exec_channel *dst,
1021   const union tgsi_exec_channel *src0,
1022   const union tgsi_exec_channel *src1 )
1023{
1024#if FAST_MATH
1025   dst->f[0] = util_fast_pow( src0->f[0], src1->f[0] );
1026   dst->f[1] = util_fast_pow( src0->f[1], src1->f[1] );
1027   dst->f[2] = util_fast_pow( src0->f[2], src1->f[2] );
1028   dst->f[3] = util_fast_pow( src0->f[3], src1->f[3] );
1029#else
1030   dst->f[0] = powf( src0->f[0], src1->f[0] );
1031   dst->f[1] = powf( src0->f[1], src1->f[1] );
1032   dst->f[2] = powf( src0->f[2], src1->f[2] );
1033   dst->f[3] = powf( src0->f[3], src1->f[3] );
1034#endif
1035}
1036
1037static void
1038micro_sub(union tgsi_exec_channel *dst,
1039          const union tgsi_exec_channel *src0,
1040          const union tgsi_exec_channel *src1)
1041{
1042   dst->f[0] = src0->f[0] - src1->f[0];
1043   dst->f[1] = src0->f[1] - src1->f[1];
1044   dst->f[2] = src0->f[2] - src1->f[2];
1045   dst->f[3] = src0->f[3] - src1->f[3];
1046}
1047
1048static void
1049fetch_src_file_channel(const struct tgsi_exec_machine *mach,
1050                       const uint chan_index,
1051                       const uint file,
1052                       const uint swizzle,
1053                       const union tgsi_exec_channel *index,
1054                       const union tgsi_exec_channel *index2D,
1055                       union tgsi_exec_channel *chan)
1056{
1057   uint i;
1058
1059   assert(swizzle < 4);
1060
1061   switch (file) {
1062   case TGSI_FILE_CONSTANT:
1063      for (i = 0; i < QUAD_SIZE; i++) {
1064         assert(index2D->i[i] >= 0 && index2D->i[i] < PIPE_MAX_CONSTANT_BUFFERS);
1065         assert(mach->Consts[index2D->i[i]]);
1066
1067         if (index->i[i] < 0) {
1068            chan->u[i] = 0;
1069         } else {
1070            /* NOTE: copying the const value as a uint instead of float */
1071            const uint constbuf = index2D->i[i];
1072            const uint *buf = (const uint *)mach->Consts[constbuf];
1073            const int pos = index->i[i] * 4 + swizzle;
1074            /* const buffer bounds check */
1075            if (pos < 0 || pos >= mach->ConstsSize[constbuf]) {
1076               if (0) {
1077                  /* Debug: print warning */
1078                  static int count = 0;
1079                  if (count++ < 100)
1080                     debug_printf("TGSI Exec: const buffer index %d"
1081                                  " out of bounds\n", pos);
1082               }
1083               chan->u[i] = 0;
1084            }
1085            else
1086               chan->u[i] = buf[pos];
1087         }
1088      }
1089      break;
1090
1091   case TGSI_FILE_INPUT:
1092      for (i = 0; i < QUAD_SIZE; i++) {
1093         /*
1094         if (TGSI_PROCESSOR_GEOMETRY == mach->Processor) {
1095            debug_printf("Fetching Input[%d] (2d=%d, 1d=%d)\n",
1096                         index2D->i[i] * TGSI_EXEC_MAX_INPUT_ATTRIBS + index->i[i],
1097                         index2D->i[i], index->i[i]);
1098                         }*/
1099         int pos = index2D->i[i] * TGSI_EXEC_MAX_INPUT_ATTRIBS + index->i[i];
1100         assert(pos >= 0);
1101         assert(pos < TGSI_MAX_PRIM_VERTICES * PIPE_MAX_ATTRIBS);
1102         chan->u[i] = mach->Inputs[pos].xyzw[swizzle].u[i];
1103      }
1104      break;
1105
1106   case TGSI_FILE_SYSTEM_VALUE:
1107      /* XXX no swizzling at this point.  Will be needed if we put
1108       * gl_FragCoord, for example, in a sys value register.
1109       */
1110      for (i = 0; i < QUAD_SIZE; i++) {
1111         chan->u[i] = mach->SystemValue[index->i[i]].u[i];
1112      }
1113      break;
1114
1115   case TGSI_FILE_TEMPORARY:
1116      for (i = 0; i < QUAD_SIZE; i++) {
1117         assert(index->i[i] < TGSI_EXEC_NUM_TEMPS);
1118         assert(index2D->i[i] == 0);
1119
1120         chan->u[i] = mach->Temps[index->i[i]].xyzw[swizzle].u[i];
1121      }
1122      break;
1123
1124   case TGSI_FILE_TEMPORARY_ARRAY:
1125      for (i = 0; i < QUAD_SIZE; i++) {
1126         assert(index->i[i] < TGSI_EXEC_NUM_TEMPS);
1127         assert(index2D->i[i] < TGSI_EXEC_NUM_TEMP_ARRAYS);
1128
1129         chan->u[i] =
1130            mach->TempArray[index2D->i[i]][index->i[i]].xyzw[swizzle].u[i];
1131      }
1132      break;
1133
1134   case TGSI_FILE_IMMEDIATE:
1135      for (i = 0; i < QUAD_SIZE; i++) {
1136         assert(index->i[i] >= 0 && index->i[i] < (int)mach->ImmLimit);
1137         assert(index2D->i[i] == 0);
1138
1139         chan->f[i] = mach->Imms[index->i[i]][swizzle];
1140      }
1141      break;
1142
1143   case TGSI_FILE_IMMEDIATE_ARRAY:
1144      for (i = 0; i < QUAD_SIZE; i++) {
1145         assert(index2D->i[i] == 0);
1146
1147         chan->f[i] = mach->ImmArray[index->i[i]][swizzle];
1148      }
1149      break;
1150
1151   case TGSI_FILE_ADDRESS:
1152      for (i = 0; i < QUAD_SIZE; i++) {
1153         assert(index->i[i] >= 0);
1154         assert(index2D->i[i] == 0);
1155
1156         chan->u[i] = mach->Addrs[index->i[i]].xyzw[swizzle].u[i];
1157      }
1158      break;
1159
1160   case TGSI_FILE_PREDICATE:
1161      for (i = 0; i < QUAD_SIZE; i++) {
1162         assert(index->i[i] >= 0 && index->i[i] < TGSI_EXEC_NUM_PREDS);
1163         assert(index2D->i[i] == 0);
1164
1165         chan->u[i] = mach->Predicates[0].xyzw[swizzle].u[i];
1166      }
1167      break;
1168
1169   case TGSI_FILE_OUTPUT:
1170      /* vertex/fragment output vars can be read too */
1171      for (i = 0; i < QUAD_SIZE; i++) {
1172         assert(index->i[i] >= 0);
1173         assert(index2D->i[i] == 0);
1174
1175         chan->u[i] = mach->Outputs[index->i[i]].xyzw[swizzle].u[i];
1176      }
1177      break;
1178
1179   default:
1180      assert(0);
1181      for (i = 0; i < QUAD_SIZE; i++) {
1182         chan->u[i] = 0;
1183      }
1184   }
1185}
1186
1187static void
1188fetch_source(const struct tgsi_exec_machine *mach,
1189             union tgsi_exec_channel *chan,
1190             const struct tgsi_full_src_register *reg,
1191             const uint chan_index,
1192             enum tgsi_exec_datatype src_datatype)
1193{
1194   union tgsi_exec_channel index;
1195   union tgsi_exec_channel index2D;
1196   uint swizzle;
1197
1198   /* We start with a direct index into a register file.
1199    *
1200    *    file[1],
1201    *    where:
1202    *       file = Register.File
1203    *       [1] = Register.Index
1204    */
1205   index.i[0] =
1206   index.i[1] =
1207   index.i[2] =
1208   index.i[3] = reg->Register.Index;
1209
1210   /* There is an extra source register that indirectly subscripts
1211    * a register file. The direct index now becomes an offset
1212    * that is being added to the indirect register.
1213    *
1214    *    file[ind[2].x+1],
1215    *    where:
1216    *       ind = Indirect.File
1217    *       [2] = Indirect.Index
1218    *       .x = Indirect.SwizzleX
1219    */
1220   if (reg->Register.Indirect) {
1221      union tgsi_exec_channel index2;
1222      union tgsi_exec_channel indir_index;
1223      const uint execmask = mach->ExecMask;
1224      uint i;
1225
1226      /* which address register (always zero now) */
1227      index2.i[0] =
1228      index2.i[1] =
1229      index2.i[2] =
1230      index2.i[3] = reg->Indirect.Index;
1231      assert(reg->Indirect.File == TGSI_FILE_ADDRESS);
1232      /* get current value of address register[swizzle] */
1233      swizzle = tgsi_util_get_src_register_swizzle( &reg->Indirect, CHAN_X );
1234      fetch_src_file_channel(mach,
1235                             chan_index,
1236                             reg->Indirect.File,
1237                             swizzle,
1238                             &index2,
1239                             &ZeroVec,
1240                             &indir_index);
1241
1242      /* add value of address register to the offset */
1243      index.i[0] += indir_index.i[0];
1244      index.i[1] += indir_index.i[1];
1245      index.i[2] += indir_index.i[2];
1246      index.i[3] += indir_index.i[3];
1247
1248      /* for disabled execution channels, zero-out the index to
1249       * avoid using a potential garbage value.
1250       */
1251      for (i = 0; i < QUAD_SIZE; i++) {
1252         if ((execmask & (1 << i)) == 0)
1253            index.i[i] = 0;
1254      }
1255   }
1256
1257   /* There is an extra source register that is a second
1258    * subscript to a register file. Effectively it means that
1259    * the register file is actually a 2D array of registers.
1260    *
1261    *    file[3][1],
1262    *    where:
1263    *       [3] = Dimension.Index
1264    */
1265   if (reg->Register.Dimension) {
1266      index2D.i[0] =
1267      index2D.i[1] =
1268      index2D.i[2] =
1269      index2D.i[3] = reg->Dimension.Index;
1270
1271      /* Again, the second subscript index can be addressed indirectly
1272       * identically to the first one.
1273       * Nothing stops us from indirectly addressing the indirect register,
1274       * but there is no need for that, so we won't exercise it.
1275       *
1276       *    file[ind[4].y+3][1],
1277       *    where:
1278       *       ind = DimIndirect.File
1279       *       [4] = DimIndirect.Index
1280       *       .y = DimIndirect.SwizzleX
1281       */
1282      if (reg->Dimension.Indirect) {
1283         union tgsi_exec_channel index2;
1284         union tgsi_exec_channel indir_index;
1285         const uint execmask = mach->ExecMask;
1286         uint i;
1287
1288         index2.i[0] =
1289         index2.i[1] =
1290         index2.i[2] =
1291         index2.i[3] = reg->DimIndirect.Index;
1292
1293         swizzle = tgsi_util_get_src_register_swizzle( &reg->DimIndirect, CHAN_X );
1294         fetch_src_file_channel(mach,
1295                                chan_index,
1296                                reg->DimIndirect.File,
1297                                swizzle,
1298                                &index2,
1299                                &ZeroVec,
1300                                &indir_index);
1301
1302         index2D.i[0] += indir_index.i[0];
1303         index2D.i[1] += indir_index.i[1];
1304         index2D.i[2] += indir_index.i[2];
1305         index2D.i[3] += indir_index.i[3];
1306
1307         /* for disabled execution channels, zero-out the index to
1308          * avoid using a potential garbage value.
1309          */
1310         for (i = 0; i < QUAD_SIZE; i++) {
1311            if ((execmask & (1 << i)) == 0) {
1312               index2D.i[i] = 0;
1313            }
1314         }
1315      }
1316
1317      /* If by any chance there was a need for a 3D array of register
1318       * files, we would have to check whether Dimension is followed
1319       * by a dimension register and continue the saga.
1320       */
1321   } else {
1322      index2D.i[0] =
1323      index2D.i[1] =
1324      index2D.i[2] =
1325      index2D.i[3] = 0;
1326   }
1327
1328   swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index );
1329   fetch_src_file_channel(mach,
1330                          chan_index,
1331                          reg->Register.File,
1332                          swizzle,
1333                          &index,
1334                          &index2D,
1335                          chan);
1336
1337   if (reg->Register.Absolute) {
1338      if (src_datatype == TGSI_EXEC_DATA_FLOAT) {
1339         micro_abs(chan, chan);
1340      } else {
1341         micro_iabs(chan, chan);
1342      }
1343   }
1344
1345   if (reg->Register.Negate) {
1346      if (src_datatype == TGSI_EXEC_DATA_FLOAT) {
1347         micro_neg(chan, chan);
1348      } else {
1349         micro_ineg(chan, chan);
1350      }
1351   }
1352}
1353
1354static void
1355store_dest(struct tgsi_exec_machine *mach,
1356           const union tgsi_exec_channel *chan,
1357           const struct tgsi_full_dst_register *reg,
1358           const struct tgsi_full_instruction *inst,
1359           uint chan_index,
1360           enum tgsi_exec_datatype dst_datatype)
1361{
1362   uint i;
1363   union tgsi_exec_channel null;
1364   union tgsi_exec_channel *dst;
1365   union tgsi_exec_channel index2D;
1366   uint execmask = mach->ExecMask;
1367   int offset = 0;  /* indirection offset */
1368   int index;
1369
1370   /* for debugging */
1371   if (0 && dst_datatype == TGSI_EXEC_DATA_FLOAT) {
1372      check_inf_or_nan(chan);
1373   }
1374
1375   /* There is an extra source register that indirectly subscripts
1376    * a register file. The direct index now becomes an offset
1377    * that is being added to the indirect register.
1378    *
1379    *    file[ind[2].x+1],
1380    *    where:
1381    *       ind = Indirect.File
1382    *       [2] = Indirect.Index
1383    *       .x = Indirect.SwizzleX
1384    */
1385   if (reg->Register.Indirect) {
1386      union tgsi_exec_channel index;
1387      union tgsi_exec_channel indir_index;
1388      uint swizzle;
1389
1390      /* which address register (always zero for now) */
1391      index.i[0] =
1392      index.i[1] =
1393      index.i[2] =
1394      index.i[3] = reg->Indirect.Index;
1395
1396      /* get current value of address register[swizzle] */
1397      swizzle = tgsi_util_get_src_register_swizzle( &reg->Indirect, CHAN_X );
1398
1399      /* fetch values from the address/indirection register */
1400      fetch_src_file_channel(mach,
1401                             chan_index,
1402                             reg->Indirect.File,
1403                             swizzle,
1404                             &index,
1405                             &ZeroVec,
1406                             &indir_index);
1407
1408      /* save indirection offset */
1409      offset = indir_index.i[0];
1410   }
1411
1412   /* There is an extra source register that is a second
1413    * subscript to a register file. Effectively it means that
1414    * the register file is actually a 2D array of registers.
1415    *
1416    *    file[3][1],
1417    *    where:
1418    *       [3] = Dimension.Index
1419    */
1420   if (reg->Register.Dimension) {
1421      index2D.i[0] =
1422      index2D.i[1] =
1423      index2D.i[2] =
1424      index2D.i[3] = reg->Dimension.Index;
1425
1426      /* Again, the second subscript index can be addressed indirectly
1427       * identically to the first one.
1428       * Nothing stops us from indirectly addressing the indirect register,
1429       * but there is no need for that, so we won't exercise it.
1430       *
1431       *    file[ind[4].y+3][1],
1432       *    where:
1433       *       ind = DimIndirect.File
1434       *       [4] = DimIndirect.Index
1435       *       .y = DimIndirect.SwizzleX
1436       */
1437      if (reg->Dimension.Indirect) {
1438         union tgsi_exec_channel index2;
1439         union tgsi_exec_channel indir_index;
1440         const uint execmask = mach->ExecMask;
1441         unsigned swizzle;
1442         uint i;
1443
1444         index2.i[0] =
1445         index2.i[1] =
1446         index2.i[2] =
1447         index2.i[3] = reg->DimIndirect.Index;
1448
1449         swizzle = tgsi_util_get_src_register_swizzle( &reg->DimIndirect, CHAN_X );
1450         fetch_src_file_channel(mach,
1451                                chan_index,
1452                                reg->DimIndirect.File,
1453                                swizzle,
1454                                &index2,
1455                                &ZeroVec,
1456                                &indir_index);
1457
1458         index2D.i[0] += indir_index.i[0];
1459         index2D.i[1] += indir_index.i[1];
1460         index2D.i[2] += indir_index.i[2];
1461         index2D.i[3] += indir_index.i[3];
1462
1463         /* for disabled execution channels, zero-out the index to
1464          * avoid using a potential garbage value.
1465          */
1466         for (i = 0; i < QUAD_SIZE; i++) {
1467            if ((execmask & (1 << i)) == 0) {
1468               index2D.i[i] = 0;
1469            }
1470         }
1471      }
1472
1473      /* If by any chance there was a need for a 3D array of register
1474       * files, we would have to check whether Dimension is followed
1475       * by a dimension register and continue the saga.
1476       */
1477   } else {
1478      index2D.i[0] =
1479      index2D.i[1] =
1480      index2D.i[2] =
1481      index2D.i[3] = 0;
1482   }
1483
1484   switch (reg->Register.File) {
1485   case TGSI_FILE_NULL:
1486      dst = &null;
1487      break;
1488
1489   case TGSI_FILE_OUTPUT:
1490      index = mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0]
1491         + reg->Register.Index;
1492      dst = &mach->Outputs[offset + index].xyzw[chan_index];
1493#if 0
1494      if (TGSI_PROCESSOR_GEOMETRY == mach->Processor) {
1495         fprintf(stderr, "STORING OUT[%d] mask(%d), = (", offset + index, execmask);
1496         for (i = 0; i < QUAD_SIZE; i++)
1497            if (execmask & (1 << i))
1498               fprintf(stderr, "%f, ", chan->f[i]);
1499         fprintf(stderr, ")\n");
1500      }
1501#endif
1502      break;
1503
1504   case TGSI_FILE_TEMPORARY:
1505      index = reg->Register.Index;
1506      assert( index < TGSI_EXEC_NUM_TEMPS );
1507      dst = &mach->Temps[offset + index].xyzw[chan_index];
1508      break;
1509
1510   case TGSI_FILE_TEMPORARY_ARRAY:
1511      index = reg->Register.Index;
1512      assert( index < TGSI_EXEC_NUM_TEMPS );
1513      assert( index2D.i[0] < TGSI_EXEC_NUM_TEMP_ARRAYS );
1514      /* XXX we use index2D.i[0] here but somehow we might
1515       * end up with someone trying to store indirectly in
1516       * different buffers */
1517      dst = &mach->TempArray[index2D.i[0]][offset + index].xyzw[chan_index];
1518      break;
1519
1520   case TGSI_FILE_ADDRESS:
1521      index = reg->Register.Index;
1522      dst = &mach->Addrs[index].xyzw[chan_index];
1523      break;
1524
1525   case TGSI_FILE_PREDICATE:
1526      index = reg->Register.Index;
1527      assert(index < TGSI_EXEC_NUM_PREDS);
1528      dst = &mach->Predicates[index].xyzw[chan_index];
1529      break;
1530
1531   default:
1532      assert( 0 );
1533      return;
1534   }
1535
1536   if (inst->Instruction.Predicate) {
1537      uint swizzle;
1538      union tgsi_exec_channel *pred;
1539
1540      switch (chan_index) {
1541      case CHAN_X:
1542         swizzle = inst->Predicate.SwizzleX;
1543         break;
1544      case CHAN_Y:
1545         swizzle = inst->Predicate.SwizzleY;
1546         break;
1547      case CHAN_Z:
1548         swizzle = inst->Predicate.SwizzleZ;
1549         break;
1550      case CHAN_W:
1551         swizzle = inst->Predicate.SwizzleW;
1552         break;
1553      default:
1554         assert(0);
1555         return;
1556      }
1557
1558      assert(inst->Predicate.Index == 0);
1559
1560      pred = &mach->Predicates[inst->Predicate.Index].xyzw[swizzle];
1561
1562      if (inst->Predicate.Negate) {
1563         for (i = 0; i < QUAD_SIZE; i++) {
1564            if (pred->u[i]) {
1565               execmask &= ~(1 << i);
1566            }
1567         }
1568      } else {
1569         for (i = 0; i < QUAD_SIZE; i++) {
1570            if (!pred->u[i]) {
1571               execmask &= ~(1 << i);
1572            }
1573         }
1574      }
1575   }
1576
1577   switch (inst->Instruction.Saturate) {
1578   case TGSI_SAT_NONE:
1579      for (i = 0; i < QUAD_SIZE; i++)
1580         if (execmask & (1 << i))
1581            dst->i[i] = chan->i[i];
1582      break;
1583
1584   case TGSI_SAT_ZERO_ONE:
1585      for (i = 0; i < QUAD_SIZE; i++)
1586         if (execmask & (1 << i)) {
1587            if (chan->f[i] < 0.0f)
1588               dst->f[i] = 0.0f;
1589            else if (chan->f[i] > 1.0f)
1590               dst->f[i] = 1.0f;
1591            else
1592               dst->i[i] = chan->i[i];
1593         }
1594      break;
1595
1596   case TGSI_SAT_MINUS_PLUS_ONE:
1597      for (i = 0; i < QUAD_SIZE; i++)
1598         if (execmask & (1 << i)) {
1599            if (chan->f[i] < -1.0f)
1600               dst->f[i] = -1.0f;
1601            else if (chan->f[i] > 1.0f)
1602               dst->f[i] = 1.0f;
1603            else
1604               dst->i[i] = chan->i[i];
1605         }
1606      break;
1607
1608   default:
1609      assert( 0 );
1610   }
1611}
1612
/*
 * Convenience wrappers around fetch_source() for instruction handlers.
 * They expect `mach` and `inst` to be in scope at the point of use and
 * fetch channel CHAN of source operand INDEX into VAL.
 */

/* fetch a source channel, treating the data as floats */
#define FETCH(VAL,INDEX,CHAN) \
    fetch_source(mach, (VAL), &inst->Src[(INDEX)], (CHAN), TGSI_EXEC_DATA_FLOAT)

/* fetch a source channel, treating the data as integers */
#define IFETCH(VAL,INDEX,CHAN) \
    fetch_source(mach, (VAL), &inst->Src[(INDEX)], (CHAN), TGSI_EXEC_DATA_INT)
1618
1619
1620/**
1621 * Execute ARB-style KIL which is predicated by a src register.
1622 * Kill fragment if any of the four values is less than zero.
1623 */
1624static void
1625exec_kil(struct tgsi_exec_machine *mach,
1626         const struct tgsi_full_instruction *inst)
1627{
1628   uint uniquemask;
1629   uint chan_index;
1630   uint kilmask = 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
1631   union tgsi_exec_channel r[1];
1632
1633   /* This mask stores component bits that were already tested. */
1634   uniquemask = 0;
1635
1636   for (chan_index = 0; chan_index < 4; chan_index++)
1637   {
1638      uint swizzle;
1639      uint i;
1640
1641      /* unswizzle channel */
1642      swizzle = tgsi_util_get_full_src_register_swizzle (
1643                        &inst->Src[0],
1644                        chan_index);
1645
1646      /* check if the component has not been already tested */
1647      if (uniquemask & (1 << swizzle))
1648         continue;
1649      uniquemask |= 1 << swizzle;
1650
1651      FETCH(&r[0], 0, chan_index);
1652      for (i = 0; i < 4; i++)
1653         if (r[0].f[i] < 0.0f)
1654            kilmask |= 1 << i;
1655   }
1656
1657   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
1658}
1659
1660/**
1661 * Execute NVIDIA-style KIL which is predicated by a condition code.
1662 * Kill fragment if the condition code is TRUE.
1663 */
1664static void
1665exec_kilp(struct tgsi_exec_machine *mach,
1666          const struct tgsi_full_instruction *inst)
1667{
1668   uint kilmask; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
1669
1670   /* "unconditional" kil */
1671   kilmask = mach->ExecMask;
1672   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
1673}
1674
1675static void
1676emit_vertex(struct tgsi_exec_machine *mach)
1677{
1678   /* FIXME: check for exec mask correctly
1679   unsigned i;
1680   for (i = 0; i < QUAD_SIZE; ++i) {
1681         if ((mach->ExecMask & (1 << i)))
1682   */
1683   if (mach->ExecMask) {
1684      mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] += mach->NumOutputs;
1685      mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]]++;
1686   }
1687}
1688
1689static void
1690emit_primitive(struct tgsi_exec_machine *mach)
1691{
1692   unsigned *prim_count = &mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0];
1693   /* FIXME: check for exec mask correctly
1694   unsigned i;
1695   for (i = 0; i < QUAD_SIZE; ++i) {
1696         if ((mach->ExecMask & (1 << i)))
1697   */
1698   if (mach->ExecMask) {
1699      ++(*prim_count);
1700      debug_assert((*prim_count * mach->NumOutputs) < mach->MaxGeometryShaderOutputs);
1701      mach->Primitives[*prim_count] = 0;
1702   }
1703}
1704
1705static void
1706conditional_emit_primitive(struct tgsi_exec_machine *mach)
1707{
1708   if (TGSI_PROCESSOR_GEOMETRY == mach->Processor) {
1709      int emitted_verts =
1710         mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]];
1711      if (emitted_verts) {
1712         emit_primitive(mach);
1713      }
1714   }
1715}
1716
1717
1718/*
1719 * Fetch four texture samples using STR texture coordinates.
1720 */
1721static void
1722fetch_texel( struct tgsi_sampler *sampler,
1723             const union tgsi_exec_channel *s,
1724             const union tgsi_exec_channel *t,
1725             const union tgsi_exec_channel *p,
1726             const union tgsi_exec_channel *c0,
1727             enum tgsi_sampler_control control,
1728             union tgsi_exec_channel *r,
1729             union tgsi_exec_channel *g,
1730             union tgsi_exec_channel *b,
1731             union tgsi_exec_channel *a )
1732{
1733   uint j;
1734   float rgba[NUM_CHANNELS][QUAD_SIZE];
1735
1736   sampler->get_samples(sampler, s->f, t->f, p->f, c0->f, control, rgba);
1737
1738   for (j = 0; j < 4; j++) {
1739      r->f[j] = rgba[0][j];
1740      g->f[j] = rgba[1][j];
1741      b->f[j] = rgba[2][j];
1742      a->f[j] = rgba[3][j];
1743   }
1744}
1745
1746
/* Values for the `modifier` argument of exec_tex(). */

/* plain lookup: the W coordinate is not fetched and a zero LOD is used */
#define TEX_MODIFIER_NONE           0
/* divide the coordinates by the W coordinate before sampling */
#define TEX_MODIFIER_PROJECTED      1
/* pass the W coordinate to the sampler as a LOD bias */
#define TEX_MODIFIER_LOD_BIAS       2
/* pass the W coordinate to the sampler as an explicit LOD */
#define TEX_MODIFIER_EXPLICIT_LOD   3
1751
1752
1753static void
1754exec_tex(struct tgsi_exec_machine *mach,
1755         const struct tgsi_full_instruction *inst,
1756         uint modifier)
1757{
1758   const uint unit = inst->Src[1].Register.Index;
1759   union tgsi_exec_channel r[4];
1760   const union tgsi_exec_channel *lod = &ZeroVec;
1761   enum tgsi_sampler_control control;
1762   uint chan;
1763
1764   if (modifier != TEX_MODIFIER_NONE) {
1765      FETCH(&r[3], 0, CHAN_W);
1766      if (modifier != TEX_MODIFIER_PROJECTED) {
1767         lod = &r[3];
1768      }
1769   }
1770
1771   if (modifier == TEX_MODIFIER_EXPLICIT_LOD) {
1772      control = tgsi_sampler_lod_explicit;
1773   } else {
1774      control = tgsi_sampler_lod_bias;
1775   }
1776
1777   switch (inst->Texture.Texture) {
1778   case TGSI_TEXTURE_1D:
1779      FETCH(&r[0], 0, CHAN_X);
1780
1781      if (modifier == TEX_MODIFIER_PROJECTED) {
1782         micro_div(&r[0], &r[0], &r[3]);
1783      }
1784
1785      fetch_texel(mach->Samplers[unit],
1786                  &r[0], &ZeroVec, &ZeroVec, lod,  /* S, T, P, LOD */
1787                  control,
1788                  &r[0], &r[1], &r[2], &r[3]);     /* R, G, B, A */
1789      break;
1790   case TGSI_TEXTURE_SHADOW1D:
1791      FETCH(&r[0], 0, CHAN_X);
1792      FETCH(&r[2], 0, CHAN_Z);
1793
1794      if (modifier == TEX_MODIFIER_PROJECTED) {
1795         micro_div(&r[0], &r[0], &r[3]);
1796      }
1797
1798      fetch_texel(mach->Samplers[unit],
1799                  &r[0], &ZeroVec, &r[2], lod,  /* S, T, P, LOD */
1800                  control,
1801                  &r[0], &r[1], &r[2], &r[3]);     /* R, G, B, A */
1802      break;
1803
1804   case TGSI_TEXTURE_2D:
1805   case TGSI_TEXTURE_RECT:
1806   case TGSI_TEXTURE_SHADOW2D:
1807   case TGSI_TEXTURE_SHADOWRECT:
1808      FETCH(&r[0], 0, CHAN_X);
1809      FETCH(&r[1], 0, CHAN_Y);
1810      FETCH(&r[2], 0, CHAN_Z);
1811
1812      if (modifier == TEX_MODIFIER_PROJECTED) {
1813         micro_div(&r[0], &r[0], &r[3]);
1814         micro_div(&r[1], &r[1], &r[3]);
1815         micro_div(&r[2], &r[2], &r[3]);
1816      }
1817
1818      fetch_texel(mach->Samplers[unit],
1819                  &r[0], &r[1], &r[2], lod,     /* S, T, P, LOD */
1820                  control,
1821                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
1822      break;
1823
1824   case TGSI_TEXTURE_1D_ARRAY:
1825      FETCH(&r[0], 0, CHAN_X);
1826      FETCH(&r[1], 0, CHAN_Y);
1827
1828      if (modifier == TEX_MODIFIER_PROJECTED) {
1829         micro_div(&r[0], &r[0], &r[3]);
1830      }
1831
1832      fetch_texel(mach->Samplers[unit],
1833                  &r[0], &r[1], &ZeroVec, lod,     /* S, T, P, LOD */
1834                  control,
1835                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
1836      break;
1837   case TGSI_TEXTURE_SHADOW1D_ARRAY:
1838      FETCH(&r[0], 0, CHAN_X);
1839      FETCH(&r[1], 0, CHAN_Y);
1840      FETCH(&r[2], 0, CHAN_Z);
1841
1842      if (modifier == TEX_MODIFIER_PROJECTED) {
1843         micro_div(&r[0], &r[0], &r[3]);
1844      }
1845
1846      fetch_texel(mach->Samplers[unit],
1847                  &r[0], &r[1], &r[2], lod,     /* S, T, P, LOD */
1848                  control,
1849                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
1850      break;
1851
1852   case TGSI_TEXTURE_2D_ARRAY:
1853      FETCH(&r[0], 0, CHAN_X);
1854      FETCH(&r[1], 0, CHAN_Y);
1855      FETCH(&r[2], 0, CHAN_Z);
1856
1857      if (modifier == TEX_MODIFIER_PROJECTED) {
1858         micro_div(&r[0], &r[0], &r[3]);
1859         micro_div(&r[1], &r[1], &r[3]);
1860      }
1861
1862      fetch_texel(mach->Samplers[unit],
1863                  &r[0], &r[1], &r[2], lod,     /* S, T, P, LOD */
1864                  control,
1865                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
1866      break;
1867   case TGSI_TEXTURE_SHADOW2D_ARRAY:
1868      FETCH(&r[0], 0, CHAN_X);
1869      FETCH(&r[1], 0, CHAN_Y);
1870      FETCH(&r[2], 0, CHAN_Z);
1871      FETCH(&r[3], 0, CHAN_W);
1872
1873      fetch_texel(mach->Samplers[unit],
1874                  &r[0], &r[1], &r[2], &r[3],     /* S, T, P, LOD */
1875                  control,
1876                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
1877      break;
1878   case TGSI_TEXTURE_3D:
1879   case TGSI_TEXTURE_CUBE:
1880      FETCH(&r[0], 0, CHAN_X);
1881      FETCH(&r[1], 0, CHAN_Y);
1882      FETCH(&r[2], 0, CHAN_Z);
1883
1884      if (modifier == TEX_MODIFIER_PROJECTED) {
1885         micro_div(&r[0], &r[0], &r[3]);
1886         micro_div(&r[1], &r[1], &r[3]);
1887         micro_div(&r[2], &r[2], &r[3]);
1888      }
1889
1890      fetch_texel(mach->Samplers[unit],
1891                  &r[0], &r[1], &r[2], lod,
1892                  control,
1893                  &r[0], &r[1], &r[2], &r[3]);
1894      break;
1895
1896   default:
1897      assert(0);
1898   }
1899
1900#if 0
1901   debug_printf("fetch r: %g %g %g %g\n",
1902         r[0].f[0], r[0].f[1], r[0].f[2], r[0].f[3]);
1903   debug_printf("fetch g: %g %g %g %g\n",
1904         r[1].f[0], r[1].f[1], r[1].f[2], r[1].f[3]);
1905   debug_printf("fetch b: %g %g %g %g\n",
1906         r[2].f[0], r[2].f[1], r[2].f[2], r[2].f[3]);
1907   debug_printf("fetch a: %g %g %g %g\n",
1908         r[3].f[0], r[3].f[1], r[3].f[2], r[3].f[3]);
1909#endif
1910
1911   for (chan = 0; chan < NUM_CHANNELS; chan++) {
1912      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1913         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
1914      }
1915   }
1916}
1917
1918static void
1919exec_txd(struct tgsi_exec_machine *mach,
1920         const struct tgsi_full_instruction *inst)
1921{
1922   const uint unit = inst->Src[3].Register.Index;
1923   union tgsi_exec_channel r[4];
1924   uint chan;
1925
1926   /*
1927    * XXX: This is fake TXD -- the derivatives are not taken into account, yet.
1928    */
1929
1930   switch (inst->Texture.Texture) {
1931   case TGSI_TEXTURE_1D:
1932   case TGSI_TEXTURE_SHADOW1D:
1933
1934      FETCH(&r[0], 0, CHAN_X);
1935
1936      fetch_texel(mach->Samplers[unit],
1937                  &r[0], &ZeroVec, &ZeroVec, &ZeroVec,   /* S, T, P, BIAS */
1938                  tgsi_sampler_lod_bias,
1939                  &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
1940      break;
1941
1942   case TGSI_TEXTURE_1D_ARRAY:
1943   case TGSI_TEXTURE_2D:
1944   case TGSI_TEXTURE_RECT:
1945   case TGSI_TEXTURE_SHADOW1D_ARRAY:
1946   case TGSI_TEXTURE_SHADOW2D:
1947   case TGSI_TEXTURE_SHADOWRECT:
1948
1949      FETCH(&r[0], 0, CHAN_X);
1950      FETCH(&r[1], 0, CHAN_Y);
1951      FETCH(&r[2], 0, CHAN_Z);
1952
1953      fetch_texel(mach->Samplers[unit],
1954                  &r[0], &r[1], &r[2], &ZeroVec,   /* inputs */
1955                  tgsi_sampler_lod_bias,
1956                  &r[0], &r[1], &r[2], &r[3]);     /* outputs */
1957      break;
1958
1959   case TGSI_TEXTURE_2D_ARRAY:
1960   case TGSI_TEXTURE_3D:
1961   case TGSI_TEXTURE_CUBE:
1962
1963      FETCH(&r[0], 0, CHAN_X);
1964      FETCH(&r[1], 0, CHAN_Y);
1965      FETCH(&r[2], 0, CHAN_Z);
1966
1967      fetch_texel(mach->Samplers[unit],
1968                  &r[0], &r[1], &r[2], &ZeroVec,
1969                  tgsi_sampler_lod_bias,
1970                  &r[0], &r[1], &r[2], &r[3]);
1971      break;
1972
1973   case TGSI_TEXTURE_SHADOW2D_ARRAY:
1974
1975      FETCH(&r[0], 0, CHAN_X);
1976      FETCH(&r[1], 0, CHAN_Y);
1977      FETCH(&r[2], 0, CHAN_Z);
1978      FETCH(&r[3], 0, CHAN_W);
1979
1980      fetch_texel(mach->Samplers[unit],
1981                  &r[0], &r[1], &r[2], &r[3],
1982                  tgsi_sampler_lod_bias,
1983                  &r[0], &r[1], &r[2], &r[3]);
1984      break;
1985
1986   default:
1987      assert(0);
1988   }
1989
1990   for (chan = 0; chan < NUM_CHANNELS; chan++) {
1991      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1992         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
1993      }
1994   }
1995}
1996
1997
1998static void
1999exec_txf(struct tgsi_exec_machine *mach,
2000	 const struct tgsi_full_instruction *inst)
2001{
2002   struct tgsi_sampler *sampler;
2003   const uint unit = inst->Src[2].Register.Index;
2004   union tgsi_exec_channel r[4];
2005   union tgsi_exec_channel offset[3];
2006   uint chan;
2007   float rgba[NUM_CHANNELS][QUAD_SIZE];
2008   int j;
2009   int8_t offsets[3];
2010
2011   if (inst->Texture.NumOffsets == 1) {
2012      union tgsi_exec_channel index;
2013      index.i[0] = index.i[1] = index.i[2] = index.i[3] = inst->TexOffsets[0].Index;
2014      fetch_src_file_channel(mach, 0, inst->TexOffsets[0].File,
2015                             inst->TexOffsets[0].SwizzleX, &index, &ZeroVec, &offset[0]);
2016      fetch_src_file_channel(mach, 0, inst->TexOffsets[0].File,
2017                             inst->TexOffsets[0].SwizzleY, &index, &ZeroVec, &offset[1]);
2018      fetch_src_file_channel(mach, 0, inst->TexOffsets[0].File,
2019                             inst->TexOffsets[0].SwizzleZ, &index, &ZeroVec, &offset[2]);
2020     offsets[0] = offset[0].i[0];
2021     offsets[1] = offset[1].i[0];
2022     offsets[2] = offset[2].i[0];
2023   } else
2024     offsets[0] = offsets[1] = offsets[2] = 0;
2025
2026   IFETCH(&r[3], 0, CHAN_W);
2027
2028   switch(inst->Texture.Texture) {
2029   case TGSI_TEXTURE_3D:
2030   case TGSI_TEXTURE_2D_ARRAY:
2031   case TGSI_TEXTURE_SHADOW2D_ARRAY:
2032      IFETCH(&r[2], 0, CHAN_Z);
2033      /* fallthrough */
2034   case TGSI_TEXTURE_2D:
2035   case TGSI_TEXTURE_RECT:
2036   case TGSI_TEXTURE_SHADOW1D_ARRAY:
2037   case TGSI_TEXTURE_SHADOW2D:
2038   case TGSI_TEXTURE_SHADOWRECT:
2039   case TGSI_TEXTURE_1D_ARRAY:
2040      IFETCH(&r[1], 0, CHAN_Y);
2041      /* fallthrough */
2042   case TGSI_TEXTURE_1D:
2043   case TGSI_TEXTURE_SHADOW1D:
2044      IFETCH(&r[0], 0, CHAN_X);
2045      break;
2046   default:
2047      assert(0);
2048      break;
2049   }
2050
2051   sampler = mach->Samplers[unit];
2052   sampler->get_texel(sampler, r[0].i, r[1].i, r[2].i, r[3].i,
2053		      offsets, rgba);
2054
2055   for (j = 0; j < QUAD_SIZE; j++) {
2056      r[0].f[j] = rgba[0][j];
2057      r[1].f[j] = rgba[1][j];
2058      r[2].f[j] = rgba[2][j];
2059      r[3].f[j] = rgba[3][j];
2060   }
2061
2062   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2063      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2064         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2065      }
2066   }
2067}
2068
2069static void
2070exec_txq(struct tgsi_exec_machine *mach,
2071         const struct tgsi_full_instruction *inst)
2072{
2073   struct tgsi_sampler *sampler;
2074   const uint unit = inst->Src[1].Register.Index;
2075   int result[4];
2076   union tgsi_exec_channel r[4], src;
2077   uint chan;
2078   int i,j;
2079
2080   fetch_source(mach, &src, &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_INT);
2081   sampler = mach->Samplers[unit];
2082
2083   sampler->get_dims(sampler, src.i[0], result);
2084
2085   for (i = 0; i < QUAD_SIZE; i++) {
2086      for (j = 0; j < 4; j++) {
2087	 r[j].i[i] = result[j];
2088      }
2089   }
2090
2091   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2092      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2093	 store_dest(mach, &r[chan], &inst->Dst[0], inst, chan,
2094		    TGSI_EXEC_DATA_INT);
2095      }
2096   }
2097}
2098
2099static void
2100exec_sample(struct tgsi_exec_machine *mach,
2101            const struct tgsi_full_instruction *inst,
2102            uint modifier)
2103{
2104   const uint resource_unit = inst->Src[1].Register.Index;
2105   const uint sampler_unit = inst->Src[2].Register.Index;
2106   union tgsi_exec_channel r[4];
2107   const union tgsi_exec_channel *lod = &ZeroVec;
2108   enum tgsi_sampler_control control;
2109   uint chan;
2110
2111   if (modifier != TEX_MODIFIER_NONE) {
2112      if (modifier == TEX_MODIFIER_LOD_BIAS)
2113         FETCH(&r[3], 3, CHAN_X);
2114      else /*TEX_MODIFIER_LOD*/
2115         FETCH(&r[3], 0, CHAN_W);
2116
2117      if (modifier != TEX_MODIFIER_PROJECTED) {
2118         lod = &r[3];
2119      }
2120   }
2121
2122   if (modifier == TEX_MODIFIER_EXPLICIT_LOD) {
2123      control = tgsi_sampler_lod_explicit;
2124   } else {
2125      control = tgsi_sampler_lod_bias;
2126   }
2127
2128   switch (mach->Resources[resource_unit].Resource) {
2129   case TGSI_TEXTURE_1D:
2130   case TGSI_TEXTURE_SHADOW1D:
2131      FETCH(&r[0], 0, CHAN_X);
2132
2133      if (modifier == TEX_MODIFIER_PROJECTED) {
2134         micro_div(&r[0], &r[0], &r[3]);
2135      }
2136
2137      fetch_texel(mach->Samplers[sampler_unit],
2138                  &r[0], &ZeroVec, &ZeroVec, lod,  /* S, T, P, LOD */
2139                  control,
2140                  &r[0], &r[1], &r[2], &r[3]);     /* R, G, B, A */
2141      break;
2142
2143   case TGSI_TEXTURE_1D_ARRAY:
2144   case TGSI_TEXTURE_2D:
2145   case TGSI_TEXTURE_RECT:
2146   case TGSI_TEXTURE_SHADOW1D_ARRAY:
2147   case TGSI_TEXTURE_SHADOW2D:
2148   case TGSI_TEXTURE_SHADOWRECT:
2149      FETCH(&r[0], 0, CHAN_X);
2150      FETCH(&r[1], 0, CHAN_Y);
2151      FETCH(&r[2], 0, CHAN_Z);
2152
2153      if (modifier == TEX_MODIFIER_PROJECTED) {
2154         micro_div(&r[0], &r[0], &r[3]);
2155         micro_div(&r[1], &r[1], &r[3]);
2156         micro_div(&r[2], &r[2], &r[3]);
2157      }
2158
2159      fetch_texel(mach->Samplers[sampler_unit],
2160                  &r[0], &r[1], &r[2], lod,     /* S, T, P, LOD */
2161                  control,
2162                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
2163      break;
2164
2165   case TGSI_TEXTURE_2D_ARRAY:
2166   case TGSI_TEXTURE_3D:
2167   case TGSI_TEXTURE_CUBE:
2168      FETCH(&r[0], 0, CHAN_X);
2169      FETCH(&r[1], 0, CHAN_Y);
2170      FETCH(&r[2], 0, CHAN_Z);
2171
2172      if (modifier == TEX_MODIFIER_PROJECTED) {
2173         micro_div(&r[0], &r[0], &r[3]);
2174         micro_div(&r[1], &r[1], &r[3]);
2175         micro_div(&r[2], &r[2], &r[3]);
2176      }
2177
2178      fetch_texel(mach->Samplers[sampler_unit],
2179                  &r[0], &r[1], &r[2], lod,
2180                  control,
2181                  &r[0], &r[1], &r[2], &r[3]);
2182      break;
2183
2184   case TGSI_TEXTURE_SHADOW2D_ARRAY:
2185      FETCH(&r[0], 0, CHAN_X);
2186      FETCH(&r[1], 0, CHAN_Y);
2187      FETCH(&r[2], 0, CHAN_Z);
2188      FETCH(&r[3], 0, CHAN_W);
2189
2190      assert(modifier != TEX_MODIFIER_PROJECTED);
2191
2192      fetch_texel(mach->Samplers[sampler_unit],
2193                  &r[0], &r[1], &r[2], &r[3],
2194                  control,
2195                  &r[0], &r[1], &r[2], &r[3]);
2196      break;
2197
2198   default:
2199      assert(0);
2200   }
2201
2202   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2203      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2204         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2205      }
2206   }
2207}
2208
2209static void
2210exec_sample_d(struct tgsi_exec_machine *mach,
2211              const struct tgsi_full_instruction *inst)
2212{
2213   const uint resource_unit = inst->Src[1].Register.Index;
2214   const uint sampler_unit = inst->Src[2].Register.Index;
2215   union tgsi_exec_channel r[4];
2216   uint chan;
2217   /*
2218    * XXX: This is fake SAMPLE_D -- the derivatives are not taken into account, yet.
2219    */
2220
2221   switch (mach->Resources[resource_unit].Resource) {
2222   case TGSI_TEXTURE_1D:
2223   case TGSI_TEXTURE_SHADOW1D:
2224
2225      FETCH(&r[0], 0, CHAN_X);
2226
2227      fetch_texel(mach->Samplers[sampler_unit],
2228                  &r[0], &ZeroVec, &ZeroVec, &ZeroVec,   /* S, T, P, BIAS */
2229                  tgsi_sampler_lod_bias,
2230                  &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
2231      break;
2232
2233   case TGSI_TEXTURE_2D:
2234   case TGSI_TEXTURE_RECT:
2235   case TGSI_TEXTURE_SHADOW2D:
2236   case TGSI_TEXTURE_SHADOWRECT:
2237
2238      FETCH(&r[0], 0, CHAN_X);
2239      FETCH(&r[1], 0, CHAN_Y);
2240      FETCH(&r[2], 0, CHAN_Z);
2241
2242      fetch_texel(mach->Samplers[sampler_unit],
2243                  &r[0], &r[1], &r[2], &ZeroVec,   /* inputs */
2244                  tgsi_sampler_lod_bias,
2245                  &r[0], &r[1], &r[2], &r[3]);     /* outputs */
2246      break;
2247
2248   case TGSI_TEXTURE_3D:
2249   case TGSI_TEXTURE_CUBE:
2250
2251      FETCH(&r[0], 0, CHAN_X);
2252      FETCH(&r[1], 0, CHAN_Y);
2253      FETCH(&r[2], 0, CHAN_Z);
2254
2255      fetch_texel(mach->Samplers[sampler_unit],
2256                  &r[0], &r[1], &r[2], &ZeroVec,
2257                  tgsi_sampler_lod_bias,
2258                  &r[0], &r[1], &r[2], &r[3]);
2259      break;
2260
2261   default:
2262      assert(0);
2263   }
2264
2265   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2266      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2267         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2268      }
2269   }
2270}
2271
2272
2273/**
2274 * Evaluate a constant-valued coefficient at the position of the
2275 * current quad.
2276 */
2277static void
2278eval_constant_coef(
2279   struct tgsi_exec_machine *mach,
2280   unsigned attrib,
2281   unsigned chan )
2282{
2283   unsigned i;
2284
2285   for( i = 0; i < QUAD_SIZE; i++ ) {
2286      mach->Inputs[attrib].xyzw[chan].f[i] = mach->InterpCoefs[attrib].a0[chan];
2287   }
2288}
2289
2290/**
2291 * Evaluate a linear-valued coefficient at the position of the
2292 * current quad.
2293 */
2294static void
2295eval_linear_coef(
2296   struct tgsi_exec_machine *mach,
2297   unsigned attrib,
2298   unsigned chan )
2299{
2300   const float x = mach->QuadPos.xyzw[0].f[0];
2301   const float y = mach->QuadPos.xyzw[1].f[0];
2302   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
2303   const float dady = mach->InterpCoefs[attrib].dady[chan];
2304   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
2305   mach->Inputs[attrib].xyzw[chan].f[0] = a0;
2306   mach->Inputs[attrib].xyzw[chan].f[1] = a0 + dadx;
2307   mach->Inputs[attrib].xyzw[chan].f[2] = a0 + dady;
2308   mach->Inputs[attrib].xyzw[chan].f[3] = a0 + dadx + dady;
2309}
2310
2311/**
2312 * Evaluate a perspective-valued coefficient at the position of the
2313 * current quad.
2314 */
2315static void
2316eval_perspective_coef(
2317   struct tgsi_exec_machine *mach,
2318   unsigned attrib,
2319   unsigned chan )
2320{
2321   const float x = mach->QuadPos.xyzw[0].f[0];
2322   const float y = mach->QuadPos.xyzw[1].f[0];
2323   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
2324   const float dady = mach->InterpCoefs[attrib].dady[chan];
2325   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
2326   const float *w = mach->QuadPos.xyzw[3].f;
2327   /* divide by W here */
2328   mach->Inputs[attrib].xyzw[chan].f[0] = a0 / w[0];
2329   mach->Inputs[attrib].xyzw[chan].f[1] = (a0 + dadx) / w[1];
2330   mach->Inputs[attrib].xyzw[chan].f[2] = (a0 + dady) / w[2];
2331   mach->Inputs[attrib].xyzw[chan].f[3] = (a0 + dadx + dady) / w[3];
2332}
2333
2334
2335typedef void (* eval_coef_func)(
2336   struct tgsi_exec_machine *mach,
2337   unsigned attrib,
2338   unsigned chan );
2339
2340static void
2341exec_declaration(struct tgsi_exec_machine *mach,
2342                 const struct tgsi_full_declaration *decl)
2343{
2344   if (decl->Declaration.File == TGSI_FILE_RESOURCE) {
2345      mach->Resources[decl->Range.First] = decl->Resource;
2346      return;
2347   }
2348
2349   if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) {
2350      if (decl->Declaration.File == TGSI_FILE_INPUT) {
2351         uint first, last, mask;
2352
2353         first = decl->Range.First;
2354         last = decl->Range.Last;
2355         mask = decl->Declaration.UsageMask;
2356
2357         /* XXX we could remove this special-case code since
2358          * mach->InterpCoefs[first].a0 should already have the
2359          * front/back-face value.  But we should first update the
2360          * ureg code to emit the right UsageMask value (WRITEMASK_X).
2361          * Then, we could remove the tgsi_exec_machine::Face field.
2362          */
2363         /* XXX make FACE a system value */
2364         if (decl->Semantic.Name == TGSI_SEMANTIC_FACE) {
2365            uint i;
2366
2367            assert(decl->Semantic.Index == 0);
2368            assert(first == last);
2369
2370            for (i = 0; i < QUAD_SIZE; i++) {
2371               mach->Inputs[first].xyzw[0].f[i] = mach->Face;
2372            }
2373         } else {
2374            eval_coef_func eval;
2375            uint i, j;
2376
2377            switch (decl->Declaration.Interpolate) {
2378            case TGSI_INTERPOLATE_CONSTANT:
2379               eval = eval_constant_coef;
2380               break;
2381
2382            case TGSI_INTERPOLATE_LINEAR:
2383               eval = eval_linear_coef;
2384               break;
2385
2386            case TGSI_INTERPOLATE_PERSPECTIVE:
2387               eval = eval_perspective_coef;
2388               break;
2389
2390            case TGSI_INTERPOLATE_COLOR:
2391               eval = mach->flatshade_color ? eval_constant_coef : eval_perspective_coef;
2392               break;
2393
2394            default:
2395               assert(0);
2396               return;
2397            }
2398
2399            for (j = 0; j < NUM_CHANNELS; j++) {
2400               if (mask & (1 << j)) {
2401                  for (i = first; i <= last; i++) {
2402                     eval(mach, i, j);
2403                  }
2404               }
2405            }
2406         }
2407      }
2408   }
2409
2410   if (decl->Declaration.File == TGSI_FILE_SYSTEM_VALUE) {
2411      mach->SysSemanticToIndex[decl->Declaration.Semantic] = decl->Range.First;
2412   }
2413}
2414
2415
2416typedef void (* micro_op)(union tgsi_exec_channel *dst);
2417
2418static void
2419exec_vector(struct tgsi_exec_machine *mach,
2420            const struct tgsi_full_instruction *inst,
2421            micro_op op,
2422            enum tgsi_exec_datatype dst_datatype)
2423{
2424   unsigned int chan;
2425
2426   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2427      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2428         union tgsi_exec_channel dst;
2429
2430         op(&dst);
2431         store_dest(mach, &dst, &inst->Dst[0], inst, chan, dst_datatype);
2432      }
2433   }
2434}
2435
2436typedef void (* micro_unary_op)(union tgsi_exec_channel *dst,
2437                                const union tgsi_exec_channel *src);
2438
2439static void
2440exec_scalar_unary(struct tgsi_exec_machine *mach,
2441                  const struct tgsi_full_instruction *inst,
2442                  micro_unary_op op,
2443                  enum tgsi_exec_datatype dst_datatype,
2444                  enum tgsi_exec_datatype src_datatype)
2445{
2446   unsigned int chan;
2447   union tgsi_exec_channel src;
2448   union tgsi_exec_channel dst;
2449
2450   fetch_source(mach, &src, &inst->Src[0], CHAN_X, src_datatype);
2451   op(&dst, &src);
2452   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2453      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2454         store_dest(mach, &dst, &inst->Dst[0], inst, chan, dst_datatype);
2455      }
2456   }
2457}
2458
2459static void
2460exec_vector_unary(struct tgsi_exec_machine *mach,
2461                  const struct tgsi_full_instruction *inst,
2462                  micro_unary_op op,
2463                  enum tgsi_exec_datatype dst_datatype,
2464                  enum tgsi_exec_datatype src_datatype)
2465{
2466   unsigned int chan;
2467   struct tgsi_exec_vector dst;
2468
2469   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2470      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2471         union tgsi_exec_channel src;
2472
2473         fetch_source(mach, &src, &inst->Src[0], chan, src_datatype);
2474         op(&dst.xyzw[chan], &src);
2475      }
2476   }
2477   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2478      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2479         store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
2480      }
2481   }
2482}
2483
2484typedef void (* micro_binary_op)(union tgsi_exec_channel *dst,
2485                                 const union tgsi_exec_channel *src0,
2486                                 const union tgsi_exec_channel *src1);
2487
2488static void
2489exec_scalar_binary(struct tgsi_exec_machine *mach,
2490                   const struct tgsi_full_instruction *inst,
2491                   micro_binary_op op,
2492                   enum tgsi_exec_datatype dst_datatype,
2493                   enum tgsi_exec_datatype src_datatype)
2494{
2495   unsigned int chan;
2496   union tgsi_exec_channel src[2];
2497   union tgsi_exec_channel dst;
2498
2499   fetch_source(mach, &src[0], &inst->Src[0], CHAN_X, src_datatype);
2500   fetch_source(mach, &src[1], &inst->Src[1], CHAN_Y, src_datatype);
2501   op(&dst, &src[0], &src[1]);
2502   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2503      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2504         store_dest(mach, &dst, &inst->Dst[0], inst, chan, dst_datatype);
2505      }
2506   }
2507}
2508
2509static void
2510exec_vector_binary(struct tgsi_exec_machine *mach,
2511                   const struct tgsi_full_instruction *inst,
2512                   micro_binary_op op,
2513                   enum tgsi_exec_datatype dst_datatype,
2514                   enum tgsi_exec_datatype src_datatype)
2515{
2516   unsigned int chan;
2517   struct tgsi_exec_vector dst;
2518
2519   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2520      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2521         union tgsi_exec_channel src[2];
2522
2523         fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
2524         fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
2525         op(&dst.xyzw[chan], &src[0], &src[1]);
2526      }
2527   }
2528   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2529      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2530         store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
2531      }
2532   }
2533}
2534
2535typedef void (* micro_trinary_op)(union tgsi_exec_channel *dst,
2536                                  const union tgsi_exec_channel *src0,
2537                                  const union tgsi_exec_channel *src1,
2538                                  const union tgsi_exec_channel *src2);
2539
2540static void
2541exec_vector_trinary(struct tgsi_exec_machine *mach,
2542                    const struct tgsi_full_instruction *inst,
2543                    micro_trinary_op op,
2544                    enum tgsi_exec_datatype dst_datatype,
2545                    enum tgsi_exec_datatype src_datatype)
2546{
2547   unsigned int chan;
2548   struct tgsi_exec_vector dst;
2549
2550   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2551      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2552         union tgsi_exec_channel src[3];
2553
2554         fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
2555         fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
2556         fetch_source(mach, &src[2], &inst->Src[2], chan, src_datatype);
2557         op(&dst.xyzw[chan], &src[0], &src[1], &src[2]);
2558      }
2559   }
2560   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2561      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2562         store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
2563      }
2564   }
2565}
2566
2567static void
2568exec_dp3(struct tgsi_exec_machine *mach,
2569         const struct tgsi_full_instruction *inst)
2570{
2571   unsigned int chan;
2572   union tgsi_exec_channel arg[3];
2573
2574   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2575   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2576   micro_mul(&arg[2], &arg[0], &arg[1]);
2577
2578   for (chan = CHAN_Y; chan <= CHAN_Z; chan++) {
2579      fetch_source(mach, &arg[0], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
2580      fetch_source(mach, &arg[1], &inst->Src[1], chan, TGSI_EXEC_DATA_FLOAT);
2581      micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
2582   }
2583
2584   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2585      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2586         store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2587      }
2588   }
2589}
2590
2591static void
2592exec_dp4(struct tgsi_exec_machine *mach,
2593         const struct tgsi_full_instruction *inst)
2594{
2595   unsigned int chan;
2596   union tgsi_exec_channel arg[3];
2597
2598   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2599   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2600   micro_mul(&arg[2], &arg[0], &arg[1]);
2601
2602   for (chan = CHAN_Y; chan <= CHAN_W; chan++) {
2603      fetch_source(mach, &arg[0], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
2604      fetch_source(mach, &arg[1], &inst->Src[1], chan, TGSI_EXEC_DATA_FLOAT);
2605      micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
2606   }
2607
2608   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2609      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2610         store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2611      }
2612   }
2613}
2614
2615static void
2616exec_dp2a(struct tgsi_exec_machine *mach,
2617          const struct tgsi_full_instruction *inst)
2618{
2619   unsigned int chan;
2620   union tgsi_exec_channel arg[3];
2621
2622   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2623   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2624   micro_mul(&arg[2], &arg[0], &arg[1]);
2625
2626   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2627   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2628   micro_mad(&arg[0], &arg[0], &arg[1], &arg[2]);
2629
2630   fetch_source(mach, &arg[1], &inst->Src[2], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2631   micro_add(&arg[0], &arg[0], &arg[1]);
2632
2633   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2634      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2635         store_dest(mach, &arg[0], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2636      }
2637   }
2638}
2639
2640static void
2641exec_dph(struct tgsi_exec_machine *mach,
2642         const struct tgsi_full_instruction *inst)
2643{
2644   unsigned int chan;
2645   union tgsi_exec_channel arg[3];
2646
2647   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2648   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2649   micro_mul(&arg[2], &arg[0], &arg[1]);
2650
2651   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2652   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2653   micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
2654
2655   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2656   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2657   micro_mad(&arg[0], &arg[0], &arg[1], &arg[2]);
2658
2659   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_W, TGSI_EXEC_DATA_FLOAT);
2660   micro_add(&arg[0], &arg[0], &arg[1]);
2661
2662   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2663      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2664         store_dest(mach, &arg[0], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2665      }
2666   }
2667}
2668
2669static void
2670exec_dp2(struct tgsi_exec_machine *mach,
2671         const struct tgsi_full_instruction *inst)
2672{
2673   unsigned int chan;
2674   union tgsi_exec_channel arg[3];
2675
2676   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2677   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2678   micro_mul(&arg[2], &arg[0], &arg[1]);
2679
2680   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2681   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2682   micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
2683
2684   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2685      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2686         store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2687      }
2688   }
2689}
2690
2691static void
2692exec_nrm4(struct tgsi_exec_machine *mach,
2693          const struct tgsi_full_instruction *inst)
2694{
2695   unsigned int chan;
2696   union tgsi_exec_channel arg[4];
2697   union tgsi_exec_channel scale;
2698
2699   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2700   micro_mul(&scale, &arg[0], &arg[0]);
2701
2702   for (chan = CHAN_Y; chan <= CHAN_W; chan++) {
2703      union tgsi_exec_channel product;
2704
2705      fetch_source(mach, &arg[chan], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
2706      micro_mul(&product, &arg[chan], &arg[chan]);
2707      micro_add(&scale, &scale, &product);
2708   }
2709
2710   micro_rsq(&scale, &scale);
2711
2712   for (chan = CHAN_X; chan <= CHAN_W; chan++) {
2713      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2714         micro_mul(&arg[chan], &arg[chan], &scale);
2715         store_dest(mach, &arg[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2716      }
2717   }
2718}
2719
2720static void
2721exec_nrm3(struct tgsi_exec_machine *mach,
2722          const struct tgsi_full_instruction *inst)
2723{
2724   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XYZ) {
2725      unsigned int chan;
2726      union tgsi_exec_channel arg[3];
2727      union tgsi_exec_channel scale;
2728
2729      fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2730      micro_mul(&scale, &arg[0], &arg[0]);
2731
2732      for (chan = CHAN_Y; chan <= CHAN_Z; chan++) {
2733         union tgsi_exec_channel product;
2734
2735         fetch_source(mach, &arg[chan], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
2736         micro_mul(&product, &arg[chan], &arg[chan]);
2737         micro_add(&scale, &scale, &product);
2738      }
2739
2740      micro_rsq(&scale, &scale);
2741
2742      for (chan = CHAN_X; chan <= CHAN_Z; chan++) {
2743         if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2744            micro_mul(&arg[chan], &arg[chan], &scale);
2745            store_dest(mach, &arg[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2746         }
2747      }
2748   }
2749
2750   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2751      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2752   }
2753}
2754
2755static void
2756exec_scs(struct tgsi_exec_machine *mach,
2757         const struct tgsi_full_instruction *inst)
2758{
2759   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY) {
2760      union tgsi_exec_channel arg;
2761      union tgsi_exec_channel result;
2762
2763      fetch_source(mach, &arg, &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2764
2765      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2766         micro_cos(&result, &arg);
2767         store_dest(mach, &result, &inst->Dst[0], inst, CHAN_X, TGSI_EXEC_DATA_FLOAT);
2768      }
2769      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2770         micro_sin(&result, &arg);
2771         store_dest(mach, &result, &inst->Dst[0], inst, CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2772      }
2773   }
2774   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2775      store_dest(mach, &ZeroVec, &inst->Dst[0], inst, CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2776   }
2777   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2778      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2779   }
2780}
2781
2782static void
2783exec_x2d(struct tgsi_exec_machine *mach,
2784         const struct tgsi_full_instruction *inst)
2785{
2786   union tgsi_exec_channel r[4];
2787   union tgsi_exec_channel d[2];
2788
2789   fetch_source(mach, &r[0], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2790   fetch_source(mach, &r[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2791   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XZ) {
2792      fetch_source(mach, &r[2], &inst->Src[2], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2793      micro_mul(&r[2], &r[2], &r[0]);
2794      fetch_source(mach, &r[3], &inst->Src[2], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2795      micro_mul(&r[3], &r[3], &r[1]);
2796      micro_add(&r[2], &r[2], &r[3]);
2797      fetch_source(mach, &r[3], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2798      micro_add(&d[0], &r[2], &r[3]);
2799   }
2800   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_YW) {
2801      fetch_source(mach, &r[2], &inst->Src[2], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2802      micro_mul(&r[2], &r[2], &r[0]);
2803      fetch_source(mach, &r[3], &inst->Src[2], CHAN_W, TGSI_EXEC_DATA_FLOAT);
2804      micro_mul(&r[3], &r[3], &r[1]);
2805      micro_add(&r[2], &r[2], &r[3]);
2806      fetch_source(mach, &r[3], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2807      micro_add(&d[1], &r[2], &r[3]);
2808   }
2809   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2810      store_dest(mach, &d[0], &inst->Dst[0], inst, CHAN_X, TGSI_EXEC_DATA_FLOAT);
2811   }
2812   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2813      store_dest(mach, &d[1], &inst->Dst[0], inst, CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2814   }
2815   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2816      store_dest(mach, &d[0], &inst->Dst[0], inst, CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2817   }
2818   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2819      store_dest(mach, &d[1], &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2820   }
2821}
2822
2823static void
2824exec_rfl(struct tgsi_exec_machine *mach,
2825         const struct tgsi_full_instruction *inst)
2826{
2827   union tgsi_exec_channel r[9];
2828
2829   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XYZ) {
2830      /* r0 = dp3(src0, src0) */
2831      fetch_source(mach, &r[2], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2832      micro_mul(&r[0], &r[2], &r[2]);
2833      fetch_source(mach, &r[4], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2834      micro_mul(&r[8], &r[4], &r[4]);
2835      micro_add(&r[0], &r[0], &r[8]);
2836      fetch_source(mach, &r[6], &inst->Src[0], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2837      micro_mul(&r[8], &r[6], &r[6]);
2838      micro_add(&r[0], &r[0], &r[8]);
2839
2840      /* r1 = dp3(src0, src1) */
2841      fetch_source(mach, &r[3], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2842      micro_mul(&r[1], &r[2], &r[3]);
2843      fetch_source(mach, &r[5], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2844      micro_mul(&r[8], &r[4], &r[5]);
2845      micro_add(&r[1], &r[1], &r[8]);
2846      fetch_source(mach, &r[7], &inst->Src[1], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2847      micro_mul(&r[8], &r[6], &r[7]);
2848      micro_add(&r[1], &r[1], &r[8]);
2849
2850      /* r1 = 2 * r1 / r0 */
2851      micro_add(&r[1], &r[1], &r[1]);
2852      micro_div(&r[1], &r[1], &r[0]);
2853
2854      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2855         micro_mul(&r[2], &r[2], &r[1]);
2856         micro_sub(&r[2], &r[2], &r[3]);
2857         store_dest(mach, &r[2], &inst->Dst[0], inst, CHAN_X, TGSI_EXEC_DATA_FLOAT);
2858      }
2859      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2860         micro_mul(&r[4], &r[4], &r[1]);
2861         micro_sub(&r[4], &r[4], &r[5]);
2862         store_dest(mach, &r[4], &inst->Dst[0], inst, CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2863      }
2864      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2865         micro_mul(&r[6], &r[6], &r[1]);
2866         micro_sub(&r[6], &r[6], &r[7]);
2867         store_dest(mach, &r[6], &inst->Dst[0], inst, CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2868      }
2869   }
2870   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2871      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2872   }
2873}
2874
2875static void
2876exec_xpd(struct tgsi_exec_machine *mach,
2877         const struct tgsi_full_instruction *inst)
2878{
2879   union tgsi_exec_channel r[6];
2880   union tgsi_exec_channel d[3];
2881
2882   fetch_source(mach, &r[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2883   fetch_source(mach, &r[1], &inst->Src[1], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2884
2885   micro_mul(&r[2], &r[0], &r[1]);
2886
2887   fetch_source(mach, &r[3], &inst->Src[0], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2888   fetch_source(mach, &r[4], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2889
2890   micro_mul(&r[5], &r[3], &r[4] );
2891   micro_sub(&d[CHAN_X], &r[2], &r[5]);
2892
2893   fetch_source(mach, &r[2], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2894
2895   micro_mul(&r[3], &r[3], &r[2]);
2896
2897   fetch_source(mach, &r[5], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2898
2899   micro_mul(&r[1], &r[1], &r[5]);
2900   micro_sub(&d[CHAN_Y], &r[3], &r[1]);
2901
2902   micro_mul(&r[5], &r[5], &r[4]);
2903   micro_mul(&r[0], &r[0], &r[2]);
2904   micro_sub(&d[CHAN_Z], &r[5], &r[0]);
2905
2906   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2907      store_dest(mach, &d[CHAN_X], &inst->Dst[0], inst, CHAN_X, TGSI_EXEC_DATA_FLOAT);
2908   }
2909   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2910      store_dest(mach, &d[CHAN_Y], &inst->Dst[0], inst, CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2911   }
2912   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2913      store_dest(mach, &d[CHAN_Z], &inst->Dst[0], inst, CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2914   }
2915   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2916      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2917   }
2918}
2919
2920static void
2921exec_dst(struct tgsi_exec_machine *mach,
2922         const struct tgsi_full_instruction *inst)
2923{
2924   union tgsi_exec_channel r[2];
2925   union tgsi_exec_channel d[4];
2926
2927   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2928      fetch_source(mach, &r[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2929      fetch_source(mach, &r[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2930      micro_mul(&d[CHAN_Y], &r[0], &r[1]);
2931   }
2932   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2933      fetch_source(mach, &d[CHAN_Z], &inst->Src[0], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2934   }
2935   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2936      fetch_source(mach, &d[CHAN_W], &inst->Src[1], CHAN_W, TGSI_EXEC_DATA_FLOAT);
2937   }
2938
2939   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2940      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_X, TGSI_EXEC_DATA_FLOAT);
2941   }
2942   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2943      store_dest(mach, &d[CHAN_Y], &inst->Dst[0], inst, CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2944   }
2945   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2946      store_dest(mach, &d[CHAN_Z], &inst->Dst[0], inst, CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2947   }
2948   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2949      store_dest(mach, &d[CHAN_W], &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2950   }
2951}
2952
2953static void
2954exec_log(struct tgsi_exec_machine *mach,
2955         const struct tgsi_full_instruction *inst)
2956{
2957   union tgsi_exec_channel r[3];
2958
2959   fetch_source(mach, &r[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2960   micro_abs(&r[2], &r[0]);  /* r2 = abs(r0) */
2961   micro_lg2(&r[1], &r[2]);  /* r1 = lg2(r2) */
2962   micro_flr(&r[0], &r[1]);  /* r0 = floor(r1) */
2963   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2964      store_dest(mach, &r[0], &inst->Dst[0], inst, CHAN_X, TGSI_EXEC_DATA_FLOAT);
2965   }
2966   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2967      micro_exp2(&r[0], &r[0]);       /* r0 = 2 ^ r0 */
2968      micro_div(&r[0], &r[2], &r[0]); /* r0 = r2 / r0 */
2969      store_dest(mach, &r[0], &inst->Dst[0], inst, CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2970   }
2971   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2972      store_dest(mach, &r[1], &inst->Dst[0], inst, CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2973   }
2974   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2975      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2976   }
2977}
2978
2979static void
2980exec_exp(struct tgsi_exec_machine *mach,
2981         const struct tgsi_full_instruction *inst)
2982{
2983   union tgsi_exec_channel r[3];
2984
2985   fetch_source(mach, &r[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2986   micro_flr(&r[1], &r[0]);  /* r1 = floor(r0) */
2987   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2988      micro_exp2(&r[2], &r[1]);       /* r2 = 2 ^ r1 */
2989      store_dest(mach, &r[2], &inst->Dst[0], inst, CHAN_X, TGSI_EXEC_DATA_FLOAT);
2990   }
2991   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2992      micro_sub(&r[2], &r[0], &r[1]); /* r2 = r0 - r1 */
2993      store_dest(mach, &r[2], &inst->Dst[0], inst, CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2994   }
2995   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2996      micro_exp2(&r[2], &r[0]);       /* r2 = 2 ^ r0 */
2997      store_dest(mach, &r[2], &inst->Dst[0], inst, CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2998   }
2999   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
3000      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
3001   }
3002}
3003
3004static void
3005exec_lit(struct tgsi_exec_machine *mach,
3006         const struct tgsi_full_instruction *inst)
3007{
3008   union tgsi_exec_channel r[3];
3009   union tgsi_exec_channel d[3];
3010
3011   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_YZ) {
3012      fetch_source(mach, &r[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
3013      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
3014         fetch_source(mach, &r[1], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3015         micro_max(&r[1], &r[1], &ZeroVec);
3016
3017         fetch_source(mach, &r[2], &inst->Src[0], CHAN_W, TGSI_EXEC_DATA_FLOAT);
3018         micro_min(&r[2], &r[2], &P128Vec);
3019         micro_max(&r[2], &r[2], &M128Vec);
3020         micro_pow(&r[1], &r[1], &r[2]);
3021         micro_lt(&d[CHAN_Z], &ZeroVec, &r[0], &r[1], &ZeroVec);
3022         store_dest(mach, &d[CHAN_Z], &inst->Dst[0], inst, CHAN_Z, TGSI_EXEC_DATA_FLOAT);
3023      }
3024      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
3025         micro_max(&d[CHAN_Y], &r[0], &ZeroVec);
3026         store_dest(mach, &d[CHAN_Y], &inst->Dst[0], inst, CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3027      }
3028   }
3029   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
3030      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_X, TGSI_EXEC_DATA_FLOAT);
3031   }
3032
3033   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
3034      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
3035   }
3036}
3037
3038static void
3039exec_break(struct tgsi_exec_machine *mach)
3040{
3041   if (mach->BreakType == TGSI_EXEC_BREAK_INSIDE_LOOP) {
3042      /* turn off loop channels for each enabled exec channel */
3043      mach->LoopMask &= ~mach->ExecMask;
3044      /* Todo: if mach->LoopMask == 0, jump to end of loop */
3045      UPDATE_EXEC_MASK(mach);
3046   } else {
3047      assert(mach->BreakType == TGSI_EXEC_BREAK_INSIDE_SWITCH);
3048
3049      mach->Switch.mask = 0x0;
3050
3051      UPDATE_EXEC_MASK(mach);
3052   }
3053}
3054
3055static void
3056exec_switch(struct tgsi_exec_machine *mach,
3057            const struct tgsi_full_instruction *inst)
3058{
3059   assert(mach->SwitchStackTop < TGSI_EXEC_MAX_SWITCH_NESTING);
3060   assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
3061
3062   mach->SwitchStack[mach->SwitchStackTop++] = mach->Switch;
3063   fetch_source(mach, &mach->Switch.selector, &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_UINT);
3064   mach->Switch.mask = 0x0;
3065   mach->Switch.defaultMask = 0x0;
3066
3067   mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
3068   mach->BreakType = TGSI_EXEC_BREAK_INSIDE_SWITCH;
3069
3070   UPDATE_EXEC_MASK(mach);
3071}
3072
3073static void
3074exec_case(struct tgsi_exec_machine *mach,
3075          const struct tgsi_full_instruction *inst)
3076{
3077   uint prevMask = mach->SwitchStack[mach->SwitchStackTop - 1].mask;
3078   union tgsi_exec_channel src;
3079   uint mask = 0;
3080
3081   fetch_source(mach, &src, &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_UINT);
3082
3083   if (mach->Switch.selector.u[0] == src.u[0]) {
3084      mask |= 0x1;
3085   }
3086   if (mach->Switch.selector.u[1] == src.u[1]) {
3087      mask |= 0x2;
3088   }
3089   if (mach->Switch.selector.u[2] == src.u[2]) {
3090      mask |= 0x4;
3091   }
3092   if (mach->Switch.selector.u[3] == src.u[3]) {
3093      mask |= 0x8;
3094   }
3095
3096   mach->Switch.defaultMask |= mask;
3097
3098   mach->Switch.mask |= mask & prevMask;
3099
3100   UPDATE_EXEC_MASK(mach);
3101}
3102
3103static void
3104exec_default(struct tgsi_exec_machine *mach)
3105{
3106   uint prevMask = mach->SwitchStack[mach->SwitchStackTop - 1].mask;
3107
3108   mach->Switch.mask |= ~mach->Switch.defaultMask & prevMask;
3109
3110   UPDATE_EXEC_MASK(mach);
3111}
3112
3113static void
3114exec_endswitch(struct tgsi_exec_machine *mach)
3115{
3116   mach->Switch = mach->SwitchStack[--mach->SwitchStackTop];
3117   mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
3118
3119   UPDATE_EXEC_MASK(mach);
3120}
3121
3122static void
3123micro_i2f(union tgsi_exec_channel *dst,
3124          const union tgsi_exec_channel *src)
3125{
3126   dst->f[0] = (float)src->i[0];
3127   dst->f[1] = (float)src->i[1];
3128   dst->f[2] = (float)src->i[2];
3129   dst->f[3] = (float)src->i[3];
3130}
3131
3132static void
3133micro_not(union tgsi_exec_channel *dst,
3134          const union tgsi_exec_channel *src)
3135{
3136   dst->u[0] = ~src->u[0];
3137   dst->u[1] = ~src->u[1];
3138   dst->u[2] = ~src->u[2];
3139   dst->u[3] = ~src->u[3];
3140}
3141
3142static void
3143micro_shl(union tgsi_exec_channel *dst,
3144          const union tgsi_exec_channel *src0,
3145          const union tgsi_exec_channel *src1)
3146{
3147   dst->u[0] = src0->u[0] << src1->u[0];
3148   dst->u[1] = src0->u[1] << src1->u[1];
3149   dst->u[2] = src0->u[2] << src1->u[2];
3150   dst->u[3] = src0->u[3] << src1->u[3];
3151}
3152
3153static void
3154micro_and(union tgsi_exec_channel *dst,
3155          const union tgsi_exec_channel *src0,
3156          const union tgsi_exec_channel *src1)
3157{
3158   dst->u[0] = src0->u[0] & src1->u[0];
3159   dst->u[1] = src0->u[1] & src1->u[1];
3160   dst->u[2] = src0->u[2] & src1->u[2];
3161   dst->u[3] = src0->u[3] & src1->u[3];
3162}
3163
3164static void
3165micro_or(union tgsi_exec_channel *dst,
3166         const union tgsi_exec_channel *src0,
3167         const union tgsi_exec_channel *src1)
3168{
3169   dst->u[0] = src0->u[0] | src1->u[0];
3170   dst->u[1] = src0->u[1] | src1->u[1];
3171   dst->u[2] = src0->u[2] | src1->u[2];
3172   dst->u[3] = src0->u[3] | src1->u[3];
3173}
3174
3175static void
3176micro_xor(union tgsi_exec_channel *dst,
3177          const union tgsi_exec_channel *src0,
3178          const union tgsi_exec_channel *src1)
3179{
3180   dst->u[0] = src0->u[0] ^ src1->u[0];
3181   dst->u[1] = src0->u[1] ^ src1->u[1];
3182   dst->u[2] = src0->u[2] ^ src1->u[2];
3183   dst->u[3] = src0->u[3] ^ src1->u[3];
3184}
3185
3186static void
3187micro_mod(union tgsi_exec_channel *dst,
3188          const union tgsi_exec_channel *src0,
3189          const union tgsi_exec_channel *src1)
3190{
3191   dst->i[0] = src0->i[0] % src1->i[0];
3192   dst->i[1] = src0->i[1] % src1->i[1];
3193   dst->i[2] = src0->i[2] % src1->i[2];
3194   dst->i[3] = src0->i[3] % src1->i[3];
3195}
3196
3197static void
3198micro_f2i(union tgsi_exec_channel *dst,
3199          const union tgsi_exec_channel *src)
3200{
3201   dst->i[0] = (int)src->f[0];
3202   dst->i[1] = (int)src->f[1];
3203   dst->i[2] = (int)src->f[2];
3204   dst->i[3] = (int)src->f[3];
3205}
3206
3207static void
3208micro_idiv(union tgsi_exec_channel *dst,
3209           const union tgsi_exec_channel *src0,
3210           const union tgsi_exec_channel *src1)
3211{
3212   dst->i[0] = src0->i[0] / src1->i[0];
3213   dst->i[1] = src0->i[1] / src1->i[1];
3214   dst->i[2] = src0->i[2] / src1->i[2];
3215   dst->i[3] = src0->i[3] / src1->i[3];
3216}
3217
3218static void
3219micro_imax(union tgsi_exec_channel *dst,
3220           const union tgsi_exec_channel *src0,
3221           const union tgsi_exec_channel *src1)
3222{
3223   dst->i[0] = src0->i[0] > src1->i[0] ? src0->i[0] : src1->i[0];
3224   dst->i[1] = src0->i[1] > src1->i[1] ? src0->i[1] : src1->i[1];
3225   dst->i[2] = src0->i[2] > src1->i[2] ? src0->i[2] : src1->i[2];
3226   dst->i[3] = src0->i[3] > src1->i[3] ? src0->i[3] : src1->i[3];
3227}
3228
3229static void
3230micro_imin(union tgsi_exec_channel *dst,
3231           const union tgsi_exec_channel *src0,
3232           const union tgsi_exec_channel *src1)
3233{
3234   dst->i[0] = src0->i[0] < src1->i[0] ? src0->i[0] : src1->i[0];
3235   dst->i[1] = src0->i[1] < src1->i[1] ? src0->i[1] : src1->i[1];
3236   dst->i[2] = src0->i[2] < src1->i[2] ? src0->i[2] : src1->i[2];
3237   dst->i[3] = src0->i[3] < src1->i[3] ? src0->i[3] : src1->i[3];
3238}
3239
3240static void
3241micro_isge(union tgsi_exec_channel *dst,
3242           const union tgsi_exec_channel *src0,
3243           const union tgsi_exec_channel *src1)
3244{
3245   dst->i[0] = src0->i[0] >= src1->i[0] ? -1 : 0;
3246   dst->i[1] = src0->i[1] >= src1->i[1] ? -1 : 0;
3247   dst->i[2] = src0->i[2] >= src1->i[2] ? -1 : 0;
3248   dst->i[3] = src0->i[3] >= src1->i[3] ? -1 : 0;
3249}
3250
3251static void
3252micro_ishr(union tgsi_exec_channel *dst,
3253           const union tgsi_exec_channel *src0,
3254           const union tgsi_exec_channel *src1)
3255{
3256   dst->i[0] = src0->i[0] >> src1->i[0];
3257   dst->i[1] = src0->i[1] >> src1->i[1];
3258   dst->i[2] = src0->i[2] >> src1->i[2];
3259   dst->i[3] = src0->i[3] >> src1->i[3];
3260}
3261
3262static void
3263micro_islt(union tgsi_exec_channel *dst,
3264           const union tgsi_exec_channel *src0,
3265           const union tgsi_exec_channel *src1)
3266{
3267   dst->i[0] = src0->i[0] < src1->i[0] ? -1 : 0;
3268   dst->i[1] = src0->i[1] < src1->i[1] ? -1 : 0;
3269   dst->i[2] = src0->i[2] < src1->i[2] ? -1 : 0;
3270   dst->i[3] = src0->i[3] < src1->i[3] ? -1 : 0;
3271}
3272
3273static void
3274micro_f2u(union tgsi_exec_channel *dst,
3275          const union tgsi_exec_channel *src)
3276{
3277   dst->u[0] = (uint)src->f[0];
3278   dst->u[1] = (uint)src->f[1];
3279   dst->u[2] = (uint)src->f[2];
3280   dst->u[3] = (uint)src->f[3];
3281}
3282
3283static void
3284micro_u2f(union tgsi_exec_channel *dst,
3285          const union tgsi_exec_channel *src)
3286{
3287   dst->f[0] = (float)src->u[0];
3288   dst->f[1] = (float)src->u[1];
3289   dst->f[2] = (float)src->u[2];
3290   dst->f[3] = (float)src->u[3];
3291}
3292
3293static void
3294micro_uadd(union tgsi_exec_channel *dst,
3295           const union tgsi_exec_channel *src0,
3296           const union tgsi_exec_channel *src1)
3297{
3298   dst->u[0] = src0->u[0] + src1->u[0];
3299   dst->u[1] = src0->u[1] + src1->u[1];
3300   dst->u[2] = src0->u[2] + src1->u[2];
3301   dst->u[3] = src0->u[3] + src1->u[3];
3302}
3303
3304static void
3305micro_udiv(union tgsi_exec_channel *dst,
3306           const union tgsi_exec_channel *src0,
3307           const union tgsi_exec_channel *src1)
3308{
3309   dst->u[0] = src0->u[0] / src1->u[0];
3310   dst->u[1] = src0->u[1] / src1->u[1];
3311   dst->u[2] = src0->u[2] / src1->u[2];
3312   dst->u[3] = src0->u[3] / src1->u[3];
3313}
3314
3315static void
3316micro_umad(union tgsi_exec_channel *dst,
3317           const union tgsi_exec_channel *src0,
3318           const union tgsi_exec_channel *src1,
3319           const union tgsi_exec_channel *src2)
3320{
3321   dst->u[0] = src0->u[0] * src1->u[0] + src2->u[0];
3322   dst->u[1] = src0->u[1] * src1->u[1] + src2->u[1];
3323   dst->u[2] = src0->u[2] * src1->u[2] + src2->u[2];
3324   dst->u[3] = src0->u[3] * src1->u[3] + src2->u[3];
3325}
3326
3327static void
3328micro_umax(union tgsi_exec_channel *dst,
3329           const union tgsi_exec_channel *src0,
3330           const union tgsi_exec_channel *src1)
3331{
3332   dst->u[0] = src0->u[0] > src1->u[0] ? src0->u[0] : src1->u[0];
3333   dst->u[1] = src0->u[1] > src1->u[1] ? src0->u[1] : src1->u[1];
3334   dst->u[2] = src0->u[2] > src1->u[2] ? src0->u[2] : src1->u[2];
3335   dst->u[3] = src0->u[3] > src1->u[3] ? src0->u[3] : src1->u[3];
3336}
3337
3338static void
3339micro_umin(union tgsi_exec_channel *dst,
3340           const union tgsi_exec_channel *src0,
3341           const union tgsi_exec_channel *src1)
3342{
3343   dst->u[0] = src0->u[0] < src1->u[0] ? src0->u[0] : src1->u[0];
3344   dst->u[1] = src0->u[1] < src1->u[1] ? src0->u[1] : src1->u[1];
3345   dst->u[2] = src0->u[2] < src1->u[2] ? src0->u[2] : src1->u[2];
3346   dst->u[3] = src0->u[3] < src1->u[3] ? src0->u[3] : src1->u[3];
3347}
3348
3349static void
3350micro_umod(union tgsi_exec_channel *dst,
3351           const union tgsi_exec_channel *src0,
3352           const union tgsi_exec_channel *src1)
3353{
3354   dst->u[0] = src0->u[0] % src1->u[0];
3355   dst->u[1] = src0->u[1] % src1->u[1];
3356   dst->u[2] = src0->u[2] % src1->u[2];
3357   dst->u[3] = src0->u[3] % src1->u[3];
3358}
3359
3360static void
3361micro_umul(union tgsi_exec_channel *dst,
3362           const union tgsi_exec_channel *src0,
3363           const union tgsi_exec_channel *src1)
3364{
3365   dst->u[0] = src0->u[0] * src1->u[0];
3366   dst->u[1] = src0->u[1] * src1->u[1];
3367   dst->u[2] = src0->u[2] * src1->u[2];
3368   dst->u[3] = src0->u[3] * src1->u[3];
3369}
3370
3371static void
3372micro_useq(union tgsi_exec_channel *dst,
3373           const union tgsi_exec_channel *src0,
3374           const union tgsi_exec_channel *src1)
3375{
3376   dst->u[0] = src0->u[0] == src1->u[0] ? ~0 : 0;
3377   dst->u[1] = src0->u[1] == src1->u[1] ? ~0 : 0;
3378   dst->u[2] = src0->u[2] == src1->u[2] ? ~0 : 0;
3379   dst->u[3] = src0->u[3] == src1->u[3] ? ~0 : 0;
3380}
3381
3382static void
3383micro_usge(union tgsi_exec_channel *dst,
3384           const union tgsi_exec_channel *src0,
3385           const union tgsi_exec_channel *src1)
3386{
3387   dst->u[0] = src0->u[0] >= src1->u[0] ? ~0 : 0;
3388   dst->u[1] = src0->u[1] >= src1->u[1] ? ~0 : 0;
3389   dst->u[2] = src0->u[2] >= src1->u[2] ? ~0 : 0;
3390   dst->u[3] = src0->u[3] >= src1->u[3] ? ~0 : 0;
3391}
3392
3393static void
3394micro_ushr(union tgsi_exec_channel *dst,
3395           const union tgsi_exec_channel *src0,
3396           const union tgsi_exec_channel *src1)
3397{
3398   dst->u[0] = src0->u[0] >> src1->u[0];
3399   dst->u[1] = src0->u[1] >> src1->u[1];
3400   dst->u[2] = src0->u[2] >> src1->u[2];
3401   dst->u[3] = src0->u[3] >> src1->u[3];
3402}
3403
3404static void
3405micro_uslt(union tgsi_exec_channel *dst,
3406           const union tgsi_exec_channel *src0,
3407           const union tgsi_exec_channel *src1)
3408{
3409   dst->u[0] = src0->u[0] < src1->u[0] ? ~0 : 0;
3410   dst->u[1] = src0->u[1] < src1->u[1] ? ~0 : 0;
3411   dst->u[2] = src0->u[2] < src1->u[2] ? ~0 : 0;
3412   dst->u[3] = src0->u[3] < src1->u[3] ? ~0 : 0;
3413}
3414
3415static void
3416micro_usne(union tgsi_exec_channel *dst,
3417           const union tgsi_exec_channel *src0,
3418           const union tgsi_exec_channel *src1)
3419{
3420   dst->u[0] = src0->u[0] != src1->u[0] ? ~0 : 0;
3421   dst->u[1] = src0->u[1] != src1->u[1] ? ~0 : 0;
3422   dst->u[2] = src0->u[2] != src1->u[2] ? ~0 : 0;
3423   dst->u[3] = src0->u[3] != src1->u[3] ? ~0 : 0;
3424}
3425
3426static void
3427micro_uarl(union tgsi_exec_channel *dst,
3428           const union tgsi_exec_channel *src)
3429{
3430   dst->i[0] = src->u[0];
3431   dst->i[1] = src->u[1];
3432   dst->i[2] = src->u[2];
3433   dst->i[3] = src->u[3];
3434}
3435
3436static void
3437micro_ucmp(union tgsi_exec_channel *dst,
3438           const union tgsi_exec_channel *src0,
3439           const union tgsi_exec_channel *src1,
3440           const union tgsi_exec_channel *src2)
3441{
3442   dst->u[0] = src0->u[0] ? src1->u[0] : src2->u[0];
3443   dst->u[1] = src0->u[1] ? src1->u[1] : src2->u[1];
3444   dst->u[2] = src0->u[2] ? src1->u[2] : src2->u[2];
3445   dst->u[3] = src0->u[3] ? src1->u[3] : src2->u[3];
3446}
3447
3448static void
3449exec_instruction(
3450   struct tgsi_exec_machine *mach,
3451   const struct tgsi_full_instruction *inst,
3452   int *pc )
3453{
3454   union tgsi_exec_channel r[10];
3455
3456   (*pc)++;
3457
3458   switch (inst->Instruction.Opcode) {
3459   case TGSI_OPCODE_ARL:
3460      exec_vector_unary(mach, inst, micro_arl, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
3461      break;
3462
3463   case TGSI_OPCODE_MOV:
3464      exec_vector_unary(mach, inst, micro_mov, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
3465      break;
3466
3467   case TGSI_OPCODE_LIT:
3468      exec_lit(mach, inst);
3469      break;
3470
3471   case TGSI_OPCODE_RCP:
3472      exec_scalar_unary(mach, inst, micro_rcp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3473      break;
3474
3475   case TGSI_OPCODE_RSQ:
3476      exec_scalar_unary(mach, inst, micro_rsq, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3477      break;
3478
3479   case TGSI_OPCODE_EXP:
3480      exec_exp(mach, inst);
3481      break;
3482
3483   case TGSI_OPCODE_LOG:
3484      exec_log(mach, inst);
3485      break;
3486
3487   case TGSI_OPCODE_MUL:
3488      exec_vector_binary(mach, inst, micro_mul, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3489      break;
3490
3491   case TGSI_OPCODE_ADD:
3492      exec_vector_binary(mach, inst, micro_add, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3493      break;
3494
3495   case TGSI_OPCODE_DP3:
3496      exec_dp3(mach, inst);
3497      break;
3498
3499   case TGSI_OPCODE_DP4:
3500      exec_dp4(mach, inst);
3501      break;
3502
3503   case TGSI_OPCODE_DST:
3504      exec_dst(mach, inst);
3505      break;
3506
3507   case TGSI_OPCODE_MIN:
3508      exec_vector_binary(mach, inst, micro_min, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3509      break;
3510
3511   case TGSI_OPCODE_MAX:
3512      exec_vector_binary(mach, inst, micro_max, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3513      break;
3514
3515   case TGSI_OPCODE_SLT:
3516      exec_vector_binary(mach, inst, micro_slt, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3517      break;
3518
3519   case TGSI_OPCODE_SGE:
3520      exec_vector_binary(mach, inst, micro_sge, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3521      break;
3522
3523   case TGSI_OPCODE_MAD:
3524      exec_vector_trinary(mach, inst, micro_mad, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3525      break;
3526
3527   case TGSI_OPCODE_SUB:
3528      exec_vector_binary(mach, inst, micro_sub, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3529      break;
3530
3531   case TGSI_OPCODE_LRP:
3532      exec_vector_trinary(mach, inst, micro_lrp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3533      break;
3534
3535   case TGSI_OPCODE_CND:
3536      exec_vector_trinary(mach, inst, micro_cnd, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3537      break;
3538
3539   case TGSI_OPCODE_DP2A:
3540      exec_dp2a(mach, inst);
3541      break;
3542
3543   case TGSI_OPCODE_FRC:
3544      exec_vector_unary(mach, inst, micro_frc, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3545      break;
3546
3547   case TGSI_OPCODE_CLAMP:
3548      exec_vector_trinary(mach, inst, micro_clamp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3549      break;
3550
3551   case TGSI_OPCODE_FLR:
3552      exec_vector_unary(mach, inst, micro_flr, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3553      break;
3554
3555   case TGSI_OPCODE_ROUND:
3556      exec_vector_unary(mach, inst, micro_rnd, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3557      break;
3558
3559   case TGSI_OPCODE_EX2:
3560      exec_scalar_unary(mach, inst, micro_exp2, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3561      break;
3562
3563   case TGSI_OPCODE_LG2:
3564      exec_scalar_unary(mach, inst, micro_lg2, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3565      break;
3566
3567   case TGSI_OPCODE_POW:
3568      exec_scalar_binary(mach, inst, micro_pow, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3569      break;
3570
3571   case TGSI_OPCODE_XPD:
3572      exec_xpd(mach, inst);
3573      break;
3574
3575   case TGSI_OPCODE_ABS:
3576      exec_vector_unary(mach, inst, micro_abs, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3577      break;
3578
3579   case TGSI_OPCODE_RCC:
3580      exec_scalar_unary(mach, inst, micro_rcc, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3581      break;
3582
3583   case TGSI_OPCODE_DPH:
3584      exec_dph(mach, inst);
3585      break;
3586
3587   case TGSI_OPCODE_COS:
3588      exec_scalar_unary(mach, inst, micro_cos, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3589      break;
3590
3591   case TGSI_OPCODE_DDX:
3592      exec_vector_unary(mach, inst, micro_ddx, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3593      break;
3594
3595   case TGSI_OPCODE_DDY:
3596      exec_vector_unary(mach, inst, micro_ddy, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3597      break;
3598
3599   case TGSI_OPCODE_KILP:
3600      exec_kilp (mach, inst);
3601      break;
3602
3603   case TGSI_OPCODE_KIL:
3604      exec_kil (mach, inst);
3605      break;
3606
3607   case TGSI_OPCODE_PK2H:
3608      assert (0);
3609      break;
3610
3611   case TGSI_OPCODE_PK2US:
3612      assert (0);
3613      break;
3614
3615   case TGSI_OPCODE_PK4B:
3616      assert (0);
3617      break;
3618
3619   case TGSI_OPCODE_PK4UB:
3620      assert (0);
3621      break;
3622
3623   case TGSI_OPCODE_RFL:
3624      exec_rfl(mach, inst);
3625      break;
3626
3627   case TGSI_OPCODE_SEQ:
3628      exec_vector_binary(mach, inst, micro_seq, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3629      break;
3630
3631   case TGSI_OPCODE_SFL:
3632      exec_vector(mach, inst, micro_sfl, TGSI_EXEC_DATA_FLOAT);
3633      break;
3634
3635   case TGSI_OPCODE_SGT:
3636      exec_vector_binary(mach, inst, micro_sgt, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3637      break;
3638
3639   case TGSI_OPCODE_SIN:
3640      exec_scalar_unary(mach, inst, micro_sin, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3641      break;
3642
3643   case TGSI_OPCODE_SLE:
3644      exec_vector_binary(mach, inst, micro_sle, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3645      break;
3646
3647   case TGSI_OPCODE_SNE:
3648      exec_vector_binary(mach, inst, micro_sne, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3649      break;
3650
3651   case TGSI_OPCODE_STR:
3652      exec_vector(mach, inst, micro_str, TGSI_EXEC_DATA_FLOAT);
3653      break;
3654
3655   case TGSI_OPCODE_TEX:
3656      /* simple texture lookup */
3657      /* src[0] = texcoord */
3658      /* src[1] = sampler unit */
3659      exec_tex(mach, inst, TEX_MODIFIER_NONE);
3660      break;
3661
3662   case TGSI_OPCODE_TXB:
3663      /* Texture lookup with lod bias */
3664      /* src[0] = texcoord (src[0].w = LOD bias) */
3665      /* src[1] = sampler unit */
3666      exec_tex(mach, inst, TEX_MODIFIER_LOD_BIAS);
3667      break;
3668
3669   case TGSI_OPCODE_TXD:
3670      /* Texture lookup with explict partial derivatives */
3671      /* src[0] = texcoord */
3672      /* src[1] = d[strq]/dx */
3673      /* src[2] = d[strq]/dy */
3674      /* src[3] = sampler unit */
3675      exec_txd(mach, inst);
3676      break;
3677
3678   case TGSI_OPCODE_TXL:
3679      /* Texture lookup with explit LOD */
3680      /* src[0] = texcoord (src[0].w = LOD) */
3681      /* src[1] = sampler unit */
3682      exec_tex(mach, inst, TEX_MODIFIER_EXPLICIT_LOD);
3683      break;
3684
3685   case TGSI_OPCODE_TXP:
3686      /* Texture lookup with projection */
3687      /* src[0] = texcoord (src[0].w = projection) */
3688      /* src[1] = sampler unit */
3689      exec_tex(mach, inst, TEX_MODIFIER_PROJECTED);
3690      break;
3691
3692   case TGSI_OPCODE_UP2H:
3693      assert (0);
3694      break;
3695
3696   case TGSI_OPCODE_UP2US:
3697      assert (0);
3698      break;
3699
3700   case TGSI_OPCODE_UP4B:
3701      assert (0);
3702      break;
3703
3704   case TGSI_OPCODE_UP4UB:
3705      assert (0);
3706      break;
3707
3708   case TGSI_OPCODE_X2D:
3709      exec_x2d(mach, inst);
3710      break;
3711
3712   case TGSI_OPCODE_ARA:
3713      assert (0);
3714      break;
3715
3716   case TGSI_OPCODE_ARR:
3717      exec_vector_unary(mach, inst, micro_arr, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
3718      break;
3719
3720   case TGSI_OPCODE_BRA:
3721      assert (0);
3722      break;
3723
3724   case TGSI_OPCODE_CAL:
3725      /* skip the call if no execution channels are enabled */
3726      if (mach->ExecMask) {
3727         /* do the call */
3728
3729         /* First, record the depths of the execution stacks.
3730          * This is important for deeply nested/looped return statements.
3731          * We have to unwind the stacks by the correct amount.  For a
3732          * real code generator, we could determine the number of entries
3733          * to pop off each stack with simple static analysis and avoid
3734          * implementing this data structure at run time.
3735          */
3736         mach->CallStack[mach->CallStackTop].CondStackTop = mach->CondStackTop;
3737         mach->CallStack[mach->CallStackTop].LoopStackTop = mach->LoopStackTop;
3738         mach->CallStack[mach->CallStackTop].ContStackTop = mach->ContStackTop;
3739         mach->CallStack[mach->CallStackTop].SwitchStackTop = mach->SwitchStackTop;
3740         mach->CallStack[mach->CallStackTop].BreakStackTop = mach->BreakStackTop;
3741         /* note that PC was already incremented above */
3742         mach->CallStack[mach->CallStackTop].ReturnAddr = *pc;
3743
3744         mach->CallStackTop++;
3745
3746         /* Second, push the Cond, Loop, Cont, Func stacks */
3747         assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
3748         assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3749         assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3750         assert(mach->SwitchStackTop < TGSI_EXEC_MAX_SWITCH_NESTING);
3751         assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
3752         assert(mach->FuncStackTop < TGSI_EXEC_MAX_CALL_NESTING);
3753
3754         mach->CondStack[mach->CondStackTop++] = mach->CondMask;
3755         mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
3756         mach->ContStack[mach->ContStackTop++] = mach->ContMask;
3757         mach->SwitchStack[mach->SwitchStackTop++] = mach->Switch;
3758         mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
3759         mach->FuncStack[mach->FuncStackTop++] = mach->FuncMask;
3760
3761         /* Finally, jump to the subroutine */
3762         *pc = inst->Label.Label;
3763      }
3764      break;
3765
3766   case TGSI_OPCODE_RET:
3767      mach->FuncMask &= ~mach->ExecMask;
3768      UPDATE_EXEC_MASK(mach);
3769
3770      if (mach->FuncMask == 0x0) {
3771         /* really return now (otherwise, keep executing */
3772
3773         if (mach->CallStackTop == 0) {
3774            /* returning from main() */
3775            mach->CondStackTop = 0;
3776            mach->LoopStackTop = 0;
3777            *pc = -1;
3778            return;
3779         }
3780
3781         assert(mach->CallStackTop > 0);
3782         mach->CallStackTop--;
3783
3784         mach->CondStackTop = mach->CallStack[mach->CallStackTop].CondStackTop;
3785         mach->CondMask = mach->CondStack[mach->CondStackTop];
3786
3787         mach->LoopStackTop = mach->CallStack[mach->CallStackTop].LoopStackTop;
3788         mach->LoopMask = mach->LoopStack[mach->LoopStackTop];
3789
3790         mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
3791         mach->ContMask = mach->ContStack[mach->ContStackTop];
3792
3793         mach->SwitchStackTop = mach->CallStack[mach->CallStackTop].SwitchStackTop;
3794         mach->Switch = mach->SwitchStack[mach->SwitchStackTop];
3795
3796         mach->BreakStackTop = mach->CallStack[mach->CallStackTop].BreakStackTop;
3797         mach->BreakType = mach->BreakStack[mach->BreakStackTop];
3798
3799         assert(mach->FuncStackTop > 0);
3800         mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
3801
3802         *pc = mach->CallStack[mach->CallStackTop].ReturnAddr;
3803
3804         UPDATE_EXEC_MASK(mach);
3805      }
3806      break;
3807
3808   case TGSI_OPCODE_SSG:
3809      exec_vector_unary(mach, inst, micro_sgn, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3810      break;
3811
3812   case TGSI_OPCODE_CMP:
3813      exec_vector_trinary(mach, inst, micro_cmp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3814      break;
3815
3816   case TGSI_OPCODE_SCS:
3817      exec_scs(mach, inst);
3818      break;
3819
3820   case TGSI_OPCODE_NRM:
3821      exec_nrm3(mach, inst);
3822      break;
3823
3824   case TGSI_OPCODE_NRM4:
3825      exec_nrm4(mach, inst);
3826      break;
3827
3828   case TGSI_OPCODE_DIV:
3829      exec_vector_binary(mach, inst, micro_div, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3830      break;
3831
3832   case TGSI_OPCODE_DP2:
3833      exec_dp2(mach, inst);
3834      break;
3835
3836   case TGSI_OPCODE_IF:
3837      /* push CondMask */
3838      assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
3839      mach->CondStack[mach->CondStackTop++] = mach->CondMask;
3840      FETCH( &r[0], 0, CHAN_X );
3841      /* update CondMask */
3842      if( ! r[0].u[0] ) {
3843         mach->CondMask &= ~0x1;
3844      }
3845      if( ! r[0].u[1] ) {
3846         mach->CondMask &= ~0x2;
3847      }
3848      if( ! r[0].u[2] ) {
3849         mach->CondMask &= ~0x4;
3850      }
3851      if( ! r[0].u[3] ) {
3852         mach->CondMask &= ~0x8;
3853      }
3854      UPDATE_EXEC_MASK(mach);
3855      /* Todo: If CondMask==0, jump to ELSE */
3856      break;
3857
3858   case TGSI_OPCODE_ELSE:
3859      /* invert CondMask wrt previous mask */
3860      {
3861         uint prevMask;
3862         assert(mach->CondStackTop > 0);
3863         prevMask = mach->CondStack[mach->CondStackTop - 1];
3864         mach->CondMask = ~mach->CondMask & prevMask;
3865         UPDATE_EXEC_MASK(mach);
3866         /* Todo: If CondMask==0, jump to ENDIF */
3867      }
3868      break;
3869
3870   case TGSI_OPCODE_ENDIF:
3871      /* pop CondMask */
3872      assert(mach->CondStackTop > 0);
3873      mach->CondMask = mach->CondStack[--mach->CondStackTop];
3874      UPDATE_EXEC_MASK(mach);
3875      break;
3876
3877   case TGSI_OPCODE_END:
3878      /* make sure we end primitives which haven't
3879       * been explicitly emitted */
3880      conditional_emit_primitive(mach);
3881      /* halt execution */
3882      *pc = -1;
3883      break;
3884
3885   case TGSI_OPCODE_PUSHA:
3886      assert (0);
3887      break;
3888
3889   case TGSI_OPCODE_POPA:
3890      assert (0);
3891      break;
3892
3893   case TGSI_OPCODE_CEIL:
3894      exec_vector_unary(mach, inst, micro_ceil, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3895      break;
3896
3897   case TGSI_OPCODE_I2F:
3898      exec_vector_unary(mach, inst, micro_i2f, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_INT);
3899      break;
3900
3901   case TGSI_OPCODE_NOT:
3902      exec_vector_unary(mach, inst, micro_not, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3903      break;
3904
3905   case TGSI_OPCODE_TRUNC:
3906      exec_vector_unary(mach, inst, micro_trunc, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3907      break;
3908
3909   case TGSI_OPCODE_SHL:
3910      exec_vector_binary(mach, inst, micro_shl, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3911      break;
3912
3913   case TGSI_OPCODE_AND:
3914      exec_vector_binary(mach, inst, micro_and, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3915      break;
3916
3917   case TGSI_OPCODE_OR:
3918      exec_vector_binary(mach, inst, micro_or, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3919      break;
3920
3921   case TGSI_OPCODE_MOD:
3922      exec_vector_binary(mach, inst, micro_mod, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3923      break;
3924
3925   case TGSI_OPCODE_XOR:
3926      exec_vector_binary(mach, inst, micro_xor, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3927      break;
3928
3929   case TGSI_OPCODE_SAD:
3930      assert (0);
3931      break;
3932
3933   case TGSI_OPCODE_TXF:
3934      exec_txf(mach, inst);
3935      break;
3936
3937   case TGSI_OPCODE_TXQ:
3938      exec_txq(mach, inst);
3939      break;
3940
3941   case TGSI_OPCODE_EMIT:
3942      emit_vertex(mach);
3943      break;
3944
3945   case TGSI_OPCODE_ENDPRIM:
3946      emit_primitive(mach);
3947      break;
3948
3949   case TGSI_OPCODE_BGNLOOP:
3950      /* push LoopMask and ContMasks */
3951      assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3952      assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3953      assert(mach->LoopLabelStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3954      assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
3955
3956      mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
3957      mach->ContStack[mach->ContStackTop++] = mach->ContMask;
3958      mach->LoopLabelStack[mach->LoopLabelStackTop++] = *pc - 1;
3959      mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
3960      mach->BreakType = TGSI_EXEC_BREAK_INSIDE_LOOP;
3961      break;
3962
3963   case TGSI_OPCODE_ENDLOOP:
3964      /* Restore ContMask, but don't pop */
3965      assert(mach->ContStackTop > 0);
3966      mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
3967      UPDATE_EXEC_MASK(mach);
3968      if (mach->ExecMask) {
3969         /* repeat loop: jump to instruction just past BGNLOOP */
3970         assert(mach->LoopLabelStackTop > 0);
3971         *pc = mach->LoopLabelStack[mach->LoopLabelStackTop - 1] + 1;
3972      }
3973      else {
3974         /* exit loop: pop LoopMask */
3975         assert(mach->LoopStackTop > 0);
3976         mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
3977         /* pop ContMask */
3978         assert(mach->ContStackTop > 0);
3979         mach->ContMask = mach->ContStack[--mach->ContStackTop];
3980         assert(mach->LoopLabelStackTop > 0);
3981         --mach->LoopLabelStackTop;
3982
3983         mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
3984      }
3985      UPDATE_EXEC_MASK(mach);
3986      break;
3987
3988   case TGSI_OPCODE_BRK:
3989      exec_break(mach);
3990      break;
3991
3992   case TGSI_OPCODE_CONT:
3993      /* turn off cont channels for each enabled exec channel */
3994      mach->ContMask &= ~mach->ExecMask;
3995      /* Todo: if mach->LoopMask == 0, jump to end of loop */
3996      UPDATE_EXEC_MASK(mach);
3997      break;
3998
3999   case TGSI_OPCODE_BGNSUB:
4000      /* no-op */
4001      break;
4002
4003   case TGSI_OPCODE_ENDSUB:
4004      /*
4005       * XXX: This really should be a no-op. We should never reach this opcode.
4006       */
4007
4008      assert(mach->CallStackTop > 0);
4009      mach->CallStackTop--;
4010
4011      mach->CondStackTop = mach->CallStack[mach->CallStackTop].CondStackTop;
4012      mach->CondMask = mach->CondStack[mach->CondStackTop];
4013
4014      mach->LoopStackTop = mach->CallStack[mach->CallStackTop].LoopStackTop;
4015      mach->LoopMask = mach->LoopStack[mach->LoopStackTop];
4016
4017      mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
4018      mach->ContMask = mach->ContStack[mach->ContStackTop];
4019
4020      mach->SwitchStackTop = mach->CallStack[mach->CallStackTop].SwitchStackTop;
4021      mach->Switch = mach->SwitchStack[mach->SwitchStackTop];
4022
4023      mach->BreakStackTop = mach->CallStack[mach->CallStackTop].BreakStackTop;
4024      mach->BreakType = mach->BreakStack[mach->BreakStackTop];
4025
4026      assert(mach->FuncStackTop > 0);
4027      mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
4028
4029      *pc = mach->CallStack[mach->CallStackTop].ReturnAddr;
4030
4031      UPDATE_EXEC_MASK(mach);
4032      break;
4033
4034   case TGSI_OPCODE_NOP:
4035      break;
4036
4037   case TGSI_OPCODE_BREAKC:
4038      FETCH(&r[0], 0, CHAN_X);
4039      /* update CondMask */
4040      if (r[0].u[0] && (mach->ExecMask & 0x1)) {
4041         mach->LoopMask &= ~0x1;
4042      }
4043      if (r[0].u[1] && (mach->ExecMask & 0x2)) {
4044         mach->LoopMask &= ~0x2;
4045      }
4046      if (r[0].u[2] && (mach->ExecMask & 0x4)) {
4047         mach->LoopMask &= ~0x4;
4048      }
4049      if (r[0].u[3] && (mach->ExecMask & 0x8)) {
4050         mach->LoopMask &= ~0x8;
4051      }
4052      /* Todo: if mach->LoopMask == 0, jump to end of loop */
4053      UPDATE_EXEC_MASK(mach);
4054      break;
4055
4056   case TGSI_OPCODE_F2I:
4057      exec_vector_unary(mach, inst, micro_f2i, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
4058      break;
4059
4060   case TGSI_OPCODE_IDIV:
4061      exec_vector_binary(mach, inst, micro_idiv, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
4062      break;
4063
4064   case TGSI_OPCODE_IMAX:
4065      exec_vector_binary(mach, inst, micro_imax, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
4066      break;
4067
4068   case TGSI_OPCODE_IMIN:
4069      exec_vector_binary(mach, inst, micro_imin, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
4070      break;
4071
4072   case TGSI_OPCODE_INEG:
4073      exec_vector_unary(mach, inst, micro_ineg, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
4074      break;
4075
4076   case TGSI_OPCODE_ISGE:
4077      exec_vector_binary(mach, inst, micro_isge, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
4078      break;
4079
4080   case TGSI_OPCODE_ISHR:
4081      exec_vector_binary(mach, inst, micro_ishr, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
4082      break;
4083
4084   case TGSI_OPCODE_ISLT:
4085      exec_vector_binary(mach, inst, micro_islt, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
4086      break;
4087
4088   case TGSI_OPCODE_F2U:
4089      exec_vector_unary(mach, inst, micro_f2u, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
4090      break;
4091
4092   case TGSI_OPCODE_U2F:
4093      exec_vector_unary(mach, inst, micro_u2f, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_UINT);
4094      break;
4095
4096   case TGSI_OPCODE_UADD:
4097      exec_vector_binary(mach, inst, micro_uadd, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4098      break;
4099
4100   case TGSI_OPCODE_UDIV:
4101      exec_vector_binary(mach, inst, micro_udiv, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4102      break;
4103
4104   case TGSI_OPCODE_UMAD:
4105      exec_vector_trinary(mach, inst, micro_umad, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4106      break;
4107
4108   case TGSI_OPCODE_UMAX:
4109      exec_vector_binary(mach, inst, micro_umax, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4110      break;
4111
4112   case TGSI_OPCODE_UMIN:
4113      exec_vector_binary(mach, inst, micro_umin, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4114      break;
4115
4116   case TGSI_OPCODE_UMOD:
4117      exec_vector_binary(mach, inst, micro_umod, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4118      break;
4119
4120   case TGSI_OPCODE_UMUL:
4121      exec_vector_binary(mach, inst, micro_umul, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4122      break;
4123
4124   case TGSI_OPCODE_USEQ:
4125      exec_vector_binary(mach, inst, micro_useq, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4126      break;
4127
4128   case TGSI_OPCODE_USGE:
4129      exec_vector_binary(mach, inst, micro_usge, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4130      break;
4131
4132   case TGSI_OPCODE_USHR:
4133      exec_vector_binary(mach, inst, micro_ushr, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4134      break;
4135
4136   case TGSI_OPCODE_USLT:
4137      exec_vector_binary(mach, inst, micro_uslt, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4138      break;
4139
4140   case TGSI_OPCODE_USNE:
4141      exec_vector_binary(mach, inst, micro_usne, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4142      break;
4143
4144   case TGSI_OPCODE_SWITCH:
4145      exec_switch(mach, inst);
4146      break;
4147
4148   case TGSI_OPCODE_CASE:
4149      exec_case(mach, inst);
4150      break;
4151
4152   case TGSI_OPCODE_DEFAULT:
4153      exec_default(mach);
4154      break;
4155
4156   case TGSI_OPCODE_ENDSWITCH:
4157      exec_endswitch(mach);
4158      break;
4159
4160   case TGSI_OPCODE_LOAD:
4161      assert(0);
4162      break;
4163
4164   case TGSI_OPCODE_LOAD_MS:
4165      assert(0);
4166      break;
4167
4168   case TGSI_OPCODE_SAMPLE:
4169      exec_sample(mach, inst, TEX_MODIFIER_NONE);
4170      break;
4171
4172   case TGSI_OPCODE_SAMPLE_B:
4173      exec_sample(mach, inst, TEX_MODIFIER_LOD_BIAS);
4174      break;
4175
4176   case TGSI_OPCODE_SAMPLE_C:
4177      exec_sample(mach, inst, TEX_MODIFIER_NONE);
4178      break;
4179
4180   case TGSI_OPCODE_SAMPLE_C_LZ:
4181      exec_sample(mach, inst, TEX_MODIFIER_LOD_BIAS);
4182      break;
4183
4184   case TGSI_OPCODE_SAMPLE_D:
4185      exec_sample_d(mach, inst);
4186      break;
4187
4188   case TGSI_OPCODE_SAMPLE_L:
4189      exec_sample(mach, inst, TEX_MODIFIER_EXPLICIT_LOD);
4190      break;
4191
4192   case TGSI_OPCODE_GATHER4:
4193      assert(0);
4194      break;
4195
4196   case TGSI_OPCODE_RESINFO:
4197      assert(0);
4198      break;
4199
4200   case TGSI_OPCODE_SAMPLE_POS:
4201      assert(0);
4202      break;
4203
4204   case TGSI_OPCODE_SAMPLE_INFO:
4205      assert(0);
4206      break;
4207
4208   case TGSI_OPCODE_UARL:
4209      exec_vector_unary(mach, inst, micro_uarl, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_UINT);
4210      break;
4211
4212   case TGSI_OPCODE_UCMP:
4213      exec_vector_trinary(mach, inst, micro_ucmp, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4214      break;
4215
4216   case TGSI_OPCODE_IABS:
4217      exec_vector_unary(mach, inst, micro_iabs, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
4218      break;
4219
4220   case TGSI_OPCODE_ISSG:
4221      exec_vector_unary(mach, inst, micro_isgn, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
4222      break;
4223
4224   default:
4225      assert( 0 );
4226   }
4227}
4228
4229
4230#define DEBUG_EXECUTION 0
4231
4232
4233/**
4234 * Run TGSI interpreter.
4235 * \return bitmask of "alive" quad components
4236 */
4237uint
4238tgsi_exec_machine_run( struct tgsi_exec_machine *mach )
4239{
4240   uint i;
4241   int pc = 0;
4242
4243   mach->CondMask = 0xf;
4244   mach->LoopMask = 0xf;
4245   mach->ContMask = 0xf;
4246   mach->FuncMask = 0xf;
4247   mach->ExecMask = 0xf;
4248
4249   mach->Switch.mask = 0xf;
4250
4251   assert(mach->CondStackTop == 0);
4252   assert(mach->LoopStackTop == 0);
4253   assert(mach->ContStackTop == 0);
4254   assert(mach->SwitchStackTop == 0);
4255   assert(mach->BreakStackTop == 0);
4256   assert(mach->CallStackTop == 0);
4257
4258   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] = 0;
4259   mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] = 0;
4260
4261   if( mach->Processor == TGSI_PROCESSOR_GEOMETRY ) {
4262      mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0] = 0;
4263      mach->Primitives[0] = 0;
4264   }
4265
4266   /* execute declarations (interpolants) */
4267   for (i = 0; i < mach->NumDeclarations; i++) {
4268      exec_declaration( mach, mach->Declarations+i );
4269   }
4270
4271   {
4272#if DEBUG_EXECUTION
4273      struct tgsi_exec_vector temps[TGSI_EXEC_NUM_TEMPS + TGSI_EXEC_NUM_TEMP_EXTRAS];
4274      struct tgsi_exec_vector outputs[PIPE_MAX_ATTRIBS];
4275      uint inst = 1;
4276
4277      memcpy(temps, mach->Temps, sizeof(temps));
4278      memcpy(outputs, mach->Outputs, sizeof(outputs));
4279#endif
4280
4281      /* execute instructions, until pc is set to -1 */
4282      while (pc != -1) {
4283
4284#if DEBUG_EXECUTION
4285         uint i;
4286
4287         tgsi_dump_instruction(&mach->Instructions[pc], inst++);
4288#endif
4289
4290         assert(pc < (int) mach->NumInstructions);
4291         exec_instruction(mach, mach->Instructions + pc, &pc);
4292
4293#if DEBUG_EXECUTION
4294         for (i = 0; i < TGSI_EXEC_NUM_TEMPS + TGSI_EXEC_NUM_TEMP_EXTRAS; i++) {
4295            if (memcmp(&temps[i], &mach->Temps[i], sizeof(temps[i]))) {
4296               uint j;
4297
4298               memcpy(&temps[i], &mach->Temps[i], sizeof(temps[i]));
4299               debug_printf("TEMP[%2u] = ", i);
4300               for (j = 0; j < 4; j++) {
4301                  if (j > 0) {
4302                     debug_printf("           ");
4303                  }
4304                  debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
4305                               temps[i].xyzw[0].f[j], temps[i].xyzw[0].u[j],
4306                               temps[i].xyzw[1].f[j], temps[i].xyzw[1].u[j],
4307                               temps[i].xyzw[2].f[j], temps[i].xyzw[2].u[j],
4308                               temps[i].xyzw[3].f[j], temps[i].xyzw[3].u[j]);
4309               }
4310            }
4311         }
4312         for (i = 0; i < PIPE_MAX_ATTRIBS; i++) {
4313            if (memcmp(&outputs[i], &mach->Outputs[i], sizeof(outputs[i]))) {
4314               uint j;
4315
4316               memcpy(&outputs[i], &mach->Outputs[i], sizeof(outputs[i]));
4317               debug_printf("OUT[%2u] =  ", i);
4318               for (j = 0; j < 4; j++) {
4319                  if (j > 0) {
4320                     debug_printf("           ");
4321                  }
4322                  debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
4323                               outputs[i].xyzw[0].f[j], outputs[i].xyzw[0].u[j],
4324                               outputs[i].xyzw[1].f[j], outputs[i].xyzw[1].u[j],
4325                               outputs[i].xyzw[2].f[j], outputs[i].xyzw[2].u[j],
4326                               outputs[i].xyzw[3].f[j], outputs[i].xyzw[3].u[j]);
4327               }
4328            }
4329         }
4330#endif
4331      }
4332   }
4333
4334#if 0
4335   /* we scale from floats in [0,1] to Zbuffer ints in sp_quad_depth_test.c */
4336   if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) {
4337      /*
4338       * Scale back depth component.
4339       */
4340      for (i = 0; i < 4; i++)
4341         mach->Outputs[0].xyzw[2].f[i] *= ctx->DrawBuffer->_DepthMaxF;
4342   }
4343#endif
4344
4345   /* Strictly speaking, these assertions aren't really needed but they
4346    * can potentially catch some bugs in the control flow code.
4347    */
4348   assert(mach->CondStackTop == 0);
4349   assert(mach->LoopStackTop == 0);
4350   assert(mach->ContStackTop == 0);
4351   assert(mach->SwitchStackTop == 0);
4352   assert(mach->BreakStackTop == 0);
4353   assert(mach->CallStackTop == 0);
4354
4355   return ~mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
4356}
4357