1/**************************************************************************
2 *
3 * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
4 * All Rights Reserved.
5 * Copyright 2009-2010 VMware, Inc.  All rights Reserved.
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the
9 * "Software"), to deal in the Software without restriction, including
10 * without limitation the rights to use, copy, modify, merge, publish,
11 * distribute, sub license, and/or sell copies of the Software, and to
12 * permit persons to whom the Software is furnished to do so, subject to
13 * the following conditions:
14 *
15 * The above copyright notice and this permission notice (including the
16 * next paragraph) shall be included in all copies or substantial portions
17 * of the Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
22 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
23 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
24 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
25 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 *
27 **************************************************************************/
28
29/**
30 * TGSI interpreter/executor.
31 *
32 * Flow control information:
33 *
34 * Since we operate on 'quads' (4 pixels or 4 vertices in parallel)
35 * flow control statements (IF/ELSE/ENDIF, LOOP/ENDLOOP) require special
36 * care since a condition may be true for some quad components but false
37 * for other components.
38 *
39 * We basically execute all statements (even if they're in the part of
40 * an IF/ELSE clause that's "not taken") and use a special mask to
41 * control writing to destination registers.  This is the ExecMask.
42 * See store_dest().
43 *
 * The ExecMask is computed from several other masks (CondMask, LoopMask,
 * ContMask, Switch.mask and FuncMask) which are controlled by the flow
 * control instructions (IF/ELSE/ENDIF, LOOP/ENDLOOP, CONT, etc.).
47 *
48 *
49 * Authors:
50 *   Michal Krol
51 *   Brian Paul
52 */
53
54#include "pipe/p_compiler.h"
55#include "pipe/p_state.h"
56#include "pipe/p_shader_tokens.h"
57#include "tgsi/tgsi_dump.h"
58#include "tgsi/tgsi_parse.h"
59#include "tgsi/tgsi_util.h"
60#include "tgsi_exec.h"
61#include "util/u_memory.h"
62#include "util/u_math.h"
63
64
65#define FAST_MATH 0
66
67#define TILE_TOP_LEFT     0
68#define TILE_TOP_RIGHT    1
69#define TILE_BOTTOM_LEFT  2
70#define TILE_BOTTOM_RIGHT 3
71
72static void
73micro_abs(union tgsi_exec_channel *dst,
74          const union tgsi_exec_channel *src)
75{
76   dst->f[0] = fabsf(src->f[0]);
77   dst->f[1] = fabsf(src->f[1]);
78   dst->f[2] = fabsf(src->f[2]);
79   dst->f[3] = fabsf(src->f[3]);
80}
81
82static void
83micro_arl(union tgsi_exec_channel *dst,
84          const union tgsi_exec_channel *src)
85{
86   dst->i[0] = (int)floorf(src->f[0]);
87   dst->i[1] = (int)floorf(src->f[1]);
88   dst->i[2] = (int)floorf(src->f[2]);
89   dst->i[3] = (int)floorf(src->f[3]);
90}
91
92static void
93micro_arr(union tgsi_exec_channel *dst,
94          const union tgsi_exec_channel *src)
95{
96   dst->i[0] = (int)floorf(src->f[0] + 0.5f);
97   dst->i[1] = (int)floorf(src->f[1] + 0.5f);
98   dst->i[2] = (int)floorf(src->f[2] + 0.5f);
99   dst->i[3] = (int)floorf(src->f[3] + 0.5f);
100}
101
102static void
103micro_ceil(union tgsi_exec_channel *dst,
104           const union tgsi_exec_channel *src)
105{
106   dst->f[0] = ceilf(src->f[0]);
107   dst->f[1] = ceilf(src->f[1]);
108   dst->f[2] = ceilf(src->f[2]);
109   dst->f[3] = ceilf(src->f[3]);
110}
111
112static void
113micro_clamp(union tgsi_exec_channel *dst,
114            const union tgsi_exec_channel *src0,
115            const union tgsi_exec_channel *src1,
116            const union tgsi_exec_channel *src2)
117{
118   dst->f[0] = src0->f[0] < src1->f[0] ? src1->f[0] : src0->f[0] > src2->f[0] ? src2->f[0] : src0->f[0];
119   dst->f[1] = src0->f[1] < src1->f[1] ? src1->f[1] : src0->f[1] > src2->f[1] ? src2->f[1] : src0->f[1];
120   dst->f[2] = src0->f[2] < src1->f[2] ? src1->f[2] : src0->f[2] > src2->f[2] ? src2->f[2] : src0->f[2];
121   dst->f[3] = src0->f[3] < src1->f[3] ? src1->f[3] : src0->f[3] > src2->f[3] ? src2->f[3] : src0->f[3];
122}
123
124static void
125micro_cmp(union tgsi_exec_channel *dst,
126          const union tgsi_exec_channel *src0,
127          const union tgsi_exec_channel *src1,
128          const union tgsi_exec_channel *src2)
129{
130   dst->f[0] = src0->f[0] < 0.0f ? src1->f[0] : src2->f[0];
131   dst->f[1] = src0->f[1] < 0.0f ? src1->f[1] : src2->f[1];
132   dst->f[2] = src0->f[2] < 0.0f ? src1->f[2] : src2->f[2];
133   dst->f[3] = src0->f[3] < 0.0f ? src1->f[3] : src2->f[3];
134}
135
136static void
137micro_cnd(union tgsi_exec_channel *dst,
138          const union tgsi_exec_channel *src0,
139          const union tgsi_exec_channel *src1,
140          const union tgsi_exec_channel *src2)
141{
142   dst->f[0] = src2->f[0] > 0.5f ? src0->f[0] : src1->f[0];
143   dst->f[1] = src2->f[1] > 0.5f ? src0->f[1] : src1->f[1];
144   dst->f[2] = src2->f[2] > 0.5f ? src0->f[2] : src1->f[2];
145   dst->f[3] = src2->f[3] > 0.5f ? src0->f[3] : src1->f[3];
146}
147
148static void
149micro_cos(union tgsi_exec_channel *dst,
150          const union tgsi_exec_channel *src)
151{
152   dst->f[0] = cosf(src->f[0]);
153   dst->f[1] = cosf(src->f[1]);
154   dst->f[2] = cosf(src->f[2]);
155   dst->f[3] = cosf(src->f[3]);
156}
157
158static void
159micro_ddx(union tgsi_exec_channel *dst,
160          const union tgsi_exec_channel *src)
161{
162   dst->f[0] =
163   dst->f[1] =
164   dst->f[2] =
165   dst->f[3] = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_BOTTOM_LEFT];
166}
167
168static void
169micro_ddy(union tgsi_exec_channel *dst,
170          const union tgsi_exec_channel *src)
171{
172   dst->f[0] =
173   dst->f[1] =
174   dst->f[2] =
175   dst->f[3] = src->f[TILE_BOTTOM_LEFT] - src->f[TILE_TOP_LEFT];
176}
177
178static void
179micro_exp2(union tgsi_exec_channel *dst,
180           const union tgsi_exec_channel *src)
181{
182#if FAST_MATH
183   dst->f[0] = util_fast_exp2(src->f[0]);
184   dst->f[1] = util_fast_exp2(src->f[1]);
185   dst->f[2] = util_fast_exp2(src->f[2]);
186   dst->f[3] = util_fast_exp2(src->f[3]);
187#else
188#if DEBUG
189   /* Inf is okay for this instruction, so clamp it to silence assertions. */
190   uint i;
191   union tgsi_exec_channel clamped;
192
193   for (i = 0; i < 4; i++) {
194      if (src->f[i] > 127.99999f) {
195         clamped.f[i] = 127.99999f;
196      } else if (src->f[i] < -126.99999f) {
197         clamped.f[i] = -126.99999f;
198      } else {
199         clamped.f[i] = src->f[i];
200      }
201   }
202   src = &clamped;
203#endif /* DEBUG */
204
205   dst->f[0] = powf(2.0f, src->f[0]);
206   dst->f[1] = powf(2.0f, src->f[1]);
207   dst->f[2] = powf(2.0f, src->f[2]);
208   dst->f[3] = powf(2.0f, src->f[3]);
209#endif /* FAST_MATH */
210}
211
212static void
213micro_flr(union tgsi_exec_channel *dst,
214          const union tgsi_exec_channel *src)
215{
216   dst->f[0] = floorf(src->f[0]);
217   dst->f[1] = floorf(src->f[1]);
218   dst->f[2] = floorf(src->f[2]);
219   dst->f[3] = floorf(src->f[3]);
220}
221
222static void
223micro_frc(union tgsi_exec_channel *dst,
224          const union tgsi_exec_channel *src)
225{
226   dst->f[0] = src->f[0] - floorf(src->f[0]);
227   dst->f[1] = src->f[1] - floorf(src->f[1]);
228   dst->f[2] = src->f[2] - floorf(src->f[2]);
229   dst->f[3] = src->f[3] - floorf(src->f[3]);
230}
231
232static void
233micro_iabs(union tgsi_exec_channel *dst,
234           const union tgsi_exec_channel *src)
235{
236   dst->i[0] = src->i[0] >= 0 ? src->i[0] : -src->i[0];
237   dst->i[1] = src->i[1] >= 0 ? src->i[1] : -src->i[1];
238   dst->i[2] = src->i[2] >= 0 ? src->i[2] : -src->i[2];
239   dst->i[3] = src->i[3] >= 0 ? src->i[3] : -src->i[3];
240}
241
242static void
243micro_ineg(union tgsi_exec_channel *dst,
244           const union tgsi_exec_channel *src)
245{
246   dst->i[0] = -src->i[0];
247   dst->i[1] = -src->i[1];
248   dst->i[2] = -src->i[2];
249   dst->i[3] = -src->i[3];
250}
251
252static void
253micro_lg2(union tgsi_exec_channel *dst,
254          const union tgsi_exec_channel *src)
255{
256#if FAST_MATH
257   dst->f[0] = util_fast_log2(src->f[0]);
258   dst->f[1] = util_fast_log2(src->f[1]);
259   dst->f[2] = util_fast_log2(src->f[2]);
260   dst->f[3] = util_fast_log2(src->f[3]);
261#else
262   dst->f[0] = logf(src->f[0]) * 1.442695f;
263   dst->f[1] = logf(src->f[1]) * 1.442695f;
264   dst->f[2] = logf(src->f[2]) * 1.442695f;
265   dst->f[3] = logf(src->f[3]) * 1.442695f;
266#endif
267}
268
269static void
270micro_lrp(union tgsi_exec_channel *dst,
271          const union tgsi_exec_channel *src0,
272          const union tgsi_exec_channel *src1,
273          const union tgsi_exec_channel *src2)
274{
275   dst->f[0] = src0->f[0] * (src1->f[0] - src2->f[0]) + src2->f[0];
276   dst->f[1] = src0->f[1] * (src1->f[1] - src2->f[1]) + src2->f[1];
277   dst->f[2] = src0->f[2] * (src1->f[2] - src2->f[2]) + src2->f[2];
278   dst->f[3] = src0->f[3] * (src1->f[3] - src2->f[3]) + src2->f[3];
279}
280
281static void
282micro_mad(union tgsi_exec_channel *dst,
283          const union tgsi_exec_channel *src0,
284          const union tgsi_exec_channel *src1,
285          const union tgsi_exec_channel *src2)
286{
287   dst->f[0] = src0->f[0] * src1->f[0] + src2->f[0];
288   dst->f[1] = src0->f[1] * src1->f[1] + src2->f[1];
289   dst->f[2] = src0->f[2] * src1->f[2] + src2->f[2];
290   dst->f[3] = src0->f[3] * src1->f[3] + src2->f[3];
291}
292
293static void
294micro_mov(union tgsi_exec_channel *dst,
295          const union tgsi_exec_channel *src)
296{
297   dst->u[0] = src->u[0];
298   dst->u[1] = src->u[1];
299   dst->u[2] = src->u[2];
300   dst->u[3] = src->u[3];
301}
302
303static void
304micro_rcp(union tgsi_exec_channel *dst,
305          const union tgsi_exec_channel *src)
306{
307#if 0 /* for debugging */
308   assert(src->f[0] != 0.0f);
309   assert(src->f[1] != 0.0f);
310   assert(src->f[2] != 0.0f);
311   assert(src->f[3] != 0.0f);
312#endif
313   dst->f[0] = 1.0f / src->f[0];
314   dst->f[1] = 1.0f / src->f[1];
315   dst->f[2] = 1.0f / src->f[2];
316   dst->f[3] = 1.0f / src->f[3];
317}
318
319static void
320micro_rnd(union tgsi_exec_channel *dst,
321          const union tgsi_exec_channel *src)
322{
323   dst->f[0] = floorf(src->f[0] + 0.5f);
324   dst->f[1] = floorf(src->f[1] + 0.5f);
325   dst->f[2] = floorf(src->f[2] + 0.5f);
326   dst->f[3] = floorf(src->f[3] + 0.5f);
327}
328
329static void
330micro_rsq(union tgsi_exec_channel *dst,
331          const union tgsi_exec_channel *src)
332{
333#if 0 /* for debugging */
334   assert(src->f[0] != 0.0f);
335   assert(src->f[1] != 0.0f);
336   assert(src->f[2] != 0.0f);
337   assert(src->f[3] != 0.0f);
338#endif
339   dst->f[0] = 1.0f / sqrtf(fabsf(src->f[0]));
340   dst->f[1] = 1.0f / sqrtf(fabsf(src->f[1]));
341   dst->f[2] = 1.0f / sqrtf(fabsf(src->f[2]));
342   dst->f[3] = 1.0f / sqrtf(fabsf(src->f[3]));
343}
344
345static void
346micro_seq(union tgsi_exec_channel *dst,
347          const union tgsi_exec_channel *src0,
348          const union tgsi_exec_channel *src1)
349{
350   dst->f[0] = src0->f[0] == src1->f[0] ? 1.0f : 0.0f;
351   dst->f[1] = src0->f[1] == src1->f[1] ? 1.0f : 0.0f;
352   dst->f[2] = src0->f[2] == src1->f[2] ? 1.0f : 0.0f;
353   dst->f[3] = src0->f[3] == src1->f[3] ? 1.0f : 0.0f;
354}
355
356static void
357micro_sge(union tgsi_exec_channel *dst,
358          const union tgsi_exec_channel *src0,
359          const union tgsi_exec_channel *src1)
360{
361   dst->f[0] = src0->f[0] >= src1->f[0] ? 1.0f : 0.0f;
362   dst->f[1] = src0->f[1] >= src1->f[1] ? 1.0f : 0.0f;
363   dst->f[2] = src0->f[2] >= src1->f[2] ? 1.0f : 0.0f;
364   dst->f[3] = src0->f[3] >= src1->f[3] ? 1.0f : 0.0f;
365}
366
367static void
368micro_sgn(union tgsi_exec_channel *dst,
369          const union tgsi_exec_channel *src)
370{
371   dst->f[0] = src->f[0] < 0.0f ? -1.0f : src->f[0] > 0.0f ? 1.0f : 0.0f;
372   dst->f[1] = src->f[1] < 0.0f ? -1.0f : src->f[1] > 0.0f ? 1.0f : 0.0f;
373   dst->f[2] = src->f[2] < 0.0f ? -1.0f : src->f[2] > 0.0f ? 1.0f : 0.0f;
374   dst->f[3] = src->f[3] < 0.0f ? -1.0f : src->f[3] > 0.0f ? 1.0f : 0.0f;
375}
376
377static void
378micro_isgn(union tgsi_exec_channel *dst,
379          const union tgsi_exec_channel *src)
380{
381   dst->i[0] = src->i[0] < 0 ? -1 : src->i[0] > 0 ? 1 : 0;
382   dst->i[1] = src->i[1] < 0 ? -1 : src->i[1] > 0 ? 1 : 0;
383   dst->i[2] = src->i[2] < 0 ? -1 : src->i[2] > 0 ? 1 : 0;
384   dst->i[3] = src->i[3] < 0 ? -1 : src->i[3] > 0 ? 1 : 0;
385}
386
387static void
388micro_sgt(union tgsi_exec_channel *dst,
389          const union tgsi_exec_channel *src0,
390          const union tgsi_exec_channel *src1)
391{
392   dst->f[0] = src0->f[0] > src1->f[0] ? 1.0f : 0.0f;
393   dst->f[1] = src0->f[1] > src1->f[1] ? 1.0f : 0.0f;
394   dst->f[2] = src0->f[2] > src1->f[2] ? 1.0f : 0.0f;
395   dst->f[3] = src0->f[3] > src1->f[3] ? 1.0f : 0.0f;
396}
397
398static void
399micro_sin(union tgsi_exec_channel *dst,
400          const union tgsi_exec_channel *src)
401{
402   dst->f[0] = sinf(src->f[0]);
403   dst->f[1] = sinf(src->f[1]);
404   dst->f[2] = sinf(src->f[2]);
405   dst->f[3] = sinf(src->f[3]);
406}
407
408static void
409micro_sle(union tgsi_exec_channel *dst,
410          const union tgsi_exec_channel *src0,
411          const union tgsi_exec_channel *src1)
412{
413   dst->f[0] = src0->f[0] <= src1->f[0] ? 1.0f : 0.0f;
414   dst->f[1] = src0->f[1] <= src1->f[1] ? 1.0f : 0.0f;
415   dst->f[2] = src0->f[2] <= src1->f[2] ? 1.0f : 0.0f;
416   dst->f[3] = src0->f[3] <= src1->f[3] ? 1.0f : 0.0f;
417}
418
419static void
420micro_slt(union tgsi_exec_channel *dst,
421          const union tgsi_exec_channel *src0,
422          const union tgsi_exec_channel *src1)
423{
424   dst->f[0] = src0->f[0] < src1->f[0] ? 1.0f : 0.0f;
425   dst->f[1] = src0->f[1] < src1->f[1] ? 1.0f : 0.0f;
426   dst->f[2] = src0->f[2] < src1->f[2] ? 1.0f : 0.0f;
427   dst->f[3] = src0->f[3] < src1->f[3] ? 1.0f : 0.0f;
428}
429
430static void
431micro_sne(union tgsi_exec_channel *dst,
432          const union tgsi_exec_channel *src0,
433          const union tgsi_exec_channel *src1)
434{
435   dst->f[0] = src0->f[0] != src1->f[0] ? 1.0f : 0.0f;
436   dst->f[1] = src0->f[1] != src1->f[1] ? 1.0f : 0.0f;
437   dst->f[2] = src0->f[2] != src1->f[2] ? 1.0f : 0.0f;
438   dst->f[3] = src0->f[3] != src1->f[3] ? 1.0f : 0.0f;
439}
440
441static void
442micro_sfl(union tgsi_exec_channel *dst)
443{
444   dst->f[0] = 0.0f;
445   dst->f[1] = 0.0f;
446   dst->f[2] = 0.0f;
447   dst->f[3] = 0.0f;
448}
449
450static void
451micro_str(union tgsi_exec_channel *dst)
452{
453   dst->f[0] = 1.0f;
454   dst->f[1] = 1.0f;
455   dst->f[2] = 1.0f;
456   dst->f[3] = 1.0f;
457}
458
459static void
460micro_trunc(union tgsi_exec_channel *dst,
461            const union tgsi_exec_channel *src)
462{
463   dst->f[0] = (float)(int)src->f[0];
464   dst->f[1] = (float)(int)src->f[1];
465   dst->f[2] = (float)(int)src->f[2];
466   dst->f[3] = (float)(int)src->f[3];
467}
468
469
/**
 * Data type tags for executed values: float, signed int, unsigned int.
 * NOTE(review): presumably these select which view of
 * union tgsi_exec_channel (f, i or u) an operation works on -- the users
 * of this enum are outside this excerpt, confirm there.
 */
enum tgsi_exec_datatype {
   TGSI_EXEC_DATA_FLOAT,
   TGSI_EXEC_DATA_INT,
   TGSI_EXEC_DATA_UINT
};
475
/*
 * Shorthand locations of various utility registers (_I = Index, _C = Channel).
 * Each name below is a file-local alias for the TGSI_EXEC_TEMP_* constant
 * with the same suffix.
 */
#define TEMP_KILMASK_I     TGSI_EXEC_TEMP_KILMASK_I
#define TEMP_KILMASK_C     TGSI_EXEC_TEMP_KILMASK_C
#define TEMP_OUTPUT_I      TGSI_EXEC_TEMP_OUTPUT_I
#define TEMP_OUTPUT_C      TGSI_EXEC_TEMP_OUTPUT_C
#define TEMP_PRIMITIVE_I   TGSI_EXEC_TEMP_PRIMITIVE_I
#define TEMP_PRIMITIVE_C   TGSI_EXEC_TEMP_PRIMITIVE_C
485
486
/**
 * Recompute the execution mask: ExecMask is the bitwise AND of CondMask,
 * LoopMask, ContMask, Switch.mask and FuncMask.
 * MACH is expanded several times, so pass an expression without side
 * effects.  The argument and the whole expansion are parenthesized so the
 * macro works with any pointer expression and in any expression context.
 */
#define UPDATE_EXEC_MASK(MACH) \
      ((MACH)->ExecMask = (MACH)->CondMask & (MACH)->LoopMask & \
                          (MACH)->ContMask & (MACH)->Switch.mask & \
                          (MACH)->FuncMask)
490
491
492static const union tgsi_exec_channel ZeroVec =
493   { { 0.0, 0.0, 0.0, 0.0 } };
494
495static const union tgsi_exec_channel OneVec = {
496   {1.0f, 1.0f, 1.0f, 1.0f}
497};
498
499static const union tgsi_exec_channel P128Vec = {
500   {128.0f, 128.0f, 128.0f, 128.0f}
501};
502
503static const union tgsi_exec_channel M128Vec = {
504   {-128.0f, -128.0f, -128.0f, -128.0f}
505};
506
507
508/**
509 * Assert that none of the float values in 'chan' are infinite or NaN.
510 * NaN and Inf may occur normally during program execution and should
511 * not lead to crashes, etc.  But when debugging, it's helpful to catch
512 * them.
513 */
514static INLINE void
515check_inf_or_nan(const union tgsi_exec_channel *chan)
516{
517   assert(!util_is_inf_or_nan((chan)->f[0]));
518   assert(!util_is_inf_or_nan((chan)->f[1]));
519   assert(!util_is_inf_or_nan((chan)->f[2]));
520   assert(!util_is_inf_or_nan((chan)->f[3]));
521}
522
523
#ifdef DEBUG
/** Debug helper: print the four float lanes of 'chan', labelled with 'msg'. */
static void
print_chan(const char *msg, const union tgsi_exec_channel *chan)
{
   const float *lanes = chan->f;

   debug_printf("%s = {%f, %f, %f, %f}\n",
                msg, lanes[0], lanes[1], lanes[2], lanes[3]);
}
#endif
532
533
#ifdef DEBUG
/**
 * Debug helper: print temporary register 'index' of the machine, one line
 * per X/Y/Z/W component with its four float lanes.
 */
static void
print_temp(const struct tgsi_exec_machine *mach, uint index)
{
   const struct tgsi_exec_vector *tmp = &mach->Temps[index];
   int comp = 0;

   debug_printf("Temp[%u] =\n", index);
   while (comp < 4) {
      const float *lanes = tmp->xyzw[comp].f;

      debug_printf("  %c: { %f, %f, %f, %f }\n",
                   "XYZW"[comp],
                   lanes[0], lanes[1], lanes[2], lanes[3]);
      comp++;
   }
}
#endif
551
552
553void
554tgsi_exec_set_constant_buffers(struct tgsi_exec_machine *mach,
555                               unsigned num_bufs,
556                               const void **bufs,
557                               const unsigned *buf_sizes)
558{
559   unsigned i;
560
561   for (i = 0; i < num_bufs; i++) {
562      mach->Consts[i] = bufs[i];
563      mach->ConstsSize[i] = buf_sizes[i];
564   }
565}
566
567
568/**
569 * Check if there's a potential src/dst register data dependency when
570 * using SOA execution.
571 * Example:
572 *   MOV T, T.yxwz;
573 * This would expand into:
574 *   MOV t0, t1;
575 *   MOV t1, t0;
576 *   MOV t2, t3;
577 *   MOV t3, t2;
578 * The second instruction will have the wrong value for t0 if executed as-is.
579 */
580boolean
581tgsi_check_soa_dependencies(const struct tgsi_full_instruction *inst)
582{
583   uint i, chan;
584
585   uint writemask = inst->Dst[0].Register.WriteMask;
586   if (writemask == TGSI_WRITEMASK_X ||
587       writemask == TGSI_WRITEMASK_Y ||
588       writemask == TGSI_WRITEMASK_Z ||
589       writemask == TGSI_WRITEMASK_W ||
590       writemask == TGSI_WRITEMASK_NONE) {
591      /* no chance of data dependency */
592      return FALSE;
593   }
594
595   /* loop over src regs */
596   for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
597      if ((inst->Src[i].Register.File ==
598           inst->Dst[0].Register.File) &&
599          ((inst->Src[i].Register.Index ==
600            inst->Dst[0].Register.Index) ||
601           inst->Src[i].Register.Indirect ||
602           inst->Dst[0].Register.Indirect)) {
603         /* loop over dest channels */
604         uint channelsWritten = 0x0;
605         for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
606            if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
607               /* check if we're reading a channel that's been written */
608               uint swizzle = tgsi_util_get_full_src_register_swizzle(&inst->Src[i], chan);
609               if (channelsWritten & (1 << swizzle)) {
610                  return TRUE;
611               }
612
613               channelsWritten |= (1 << chan);
614            }
615         }
616      }
617   }
618   return FALSE;
619}
620
621
622/**
623 * Initialize machine state by expanding tokens to full instructions,
624 * allocating temporary storage, setting up constants, etc.
625 * After this, we can call tgsi_exec_machine_run() many times.
626 */
627void
628tgsi_exec_machine_bind_shader(
629   struct tgsi_exec_machine *mach,
630   const struct tgsi_token *tokens,
631   uint numSamplers,
632   struct tgsi_sampler **samplers)
633{
634   uint k;
635   struct tgsi_parse_context parse;
636   struct tgsi_full_instruction *instructions;
637   struct tgsi_full_declaration *declarations;
638   uint maxInstructions = 10, numInstructions = 0;
639   uint maxDeclarations = 10, numDeclarations = 0;
640
641#if 0
642   tgsi_dump(tokens, 0);
643#endif
644
645   util_init_math();
646
647   if (numSamplers) {
648      assert(samplers);
649   }
650
651   mach->Tokens = tokens;
652   mach->Samplers = samplers;
653
654   if (!tokens) {
655      /* unbind and free all */
656      if (mach->Declarations) {
657         FREE( mach->Declarations );
658      }
659      mach->Declarations = NULL;
660      mach->NumDeclarations = 0;
661
662      if (mach->Instructions) {
663         FREE( mach->Instructions );
664      }
665      mach->Instructions = NULL;
666      mach->NumInstructions = 0;
667
668      return;
669   }
670
671   k = tgsi_parse_init (&parse, mach->Tokens);
672   if (k != TGSI_PARSE_OK) {
673      debug_printf( "Problem parsing!\n" );
674      return;
675   }
676
677   mach->Processor = parse.FullHeader.Processor.Processor;
678   mach->ImmLimit = 0;
679
680   if (mach->Processor == TGSI_PROCESSOR_GEOMETRY &&
681       !mach->UsedGeometryShader) {
682      struct tgsi_exec_vector *inputs;
683      struct tgsi_exec_vector *outputs;
684
685      inputs = align_malloc(sizeof(struct tgsi_exec_vector) *
686                            TGSI_MAX_PRIM_VERTICES * PIPE_MAX_ATTRIBS,
687                            16);
688
689      if (!inputs)
690         return;
691
692      outputs = align_malloc(sizeof(struct tgsi_exec_vector) *
693                             TGSI_MAX_TOTAL_VERTICES, 16);
694
695      if (!outputs) {
696         align_free(inputs);
697         return;
698      }
699
700      align_free(mach->Inputs);
701      align_free(mach->Outputs);
702
703      mach->Inputs = inputs;
704      mach->Outputs = outputs;
705      mach->UsedGeometryShader = TRUE;
706   }
707
708   declarations = (struct tgsi_full_declaration *)
709      MALLOC( maxDeclarations * sizeof(struct tgsi_full_declaration) );
710
711   if (!declarations) {
712      return;
713   }
714
715   instructions = (struct tgsi_full_instruction *)
716      MALLOC( maxInstructions * sizeof(struct tgsi_full_instruction) );
717
718   if (!instructions) {
719      FREE( declarations );
720      return;
721   }
722
723   while( !tgsi_parse_end_of_tokens( &parse ) ) {
724      uint i;
725
726      tgsi_parse_token( &parse );
727      switch( parse.FullToken.Token.Type ) {
728      case TGSI_TOKEN_TYPE_DECLARATION:
729         /* save expanded declaration */
730         if (numDeclarations == maxDeclarations) {
731            declarations = REALLOC(declarations,
732                                   maxDeclarations
733                                   * sizeof(struct tgsi_full_declaration),
734                                   (maxDeclarations + 10)
735                                   * sizeof(struct tgsi_full_declaration));
736            maxDeclarations += 10;
737         }
738         if (parse.FullToken.FullDeclaration.Declaration.File == TGSI_FILE_OUTPUT) {
739            unsigned reg;
740            for (reg = parse.FullToken.FullDeclaration.Range.First;
741                 reg <= parse.FullToken.FullDeclaration.Range.Last;
742                 ++reg) {
743               ++mach->NumOutputs;
744            }
745         }
746         if (parse.FullToken.FullDeclaration.Declaration.File ==
747             TGSI_FILE_IMMEDIATE_ARRAY) {
748            unsigned reg;
749            struct tgsi_full_declaration *decl =
750               &parse.FullToken.FullDeclaration;
751            debug_assert(decl->Range.Last < TGSI_EXEC_NUM_IMMEDIATES);
752            for (reg = decl->Range.First; reg <= decl->Range.Last; ++reg) {
753               for( i = 0; i < 4; i++ ) {
754                  int idx = reg * 4 + i;
755                  mach->ImmArray[reg][i] = decl->ImmediateData.u[idx].Float;
756               }
757            }
758         }
759         memcpy(declarations + numDeclarations,
760                &parse.FullToken.FullDeclaration,
761                sizeof(declarations[0]));
762         numDeclarations++;
763         break;
764
765      case TGSI_TOKEN_TYPE_IMMEDIATE:
766         {
767            uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
768            assert( size <= 4 );
769            assert( mach->ImmLimit + 1 <= TGSI_EXEC_NUM_IMMEDIATES );
770
771            for( i = 0; i < size; i++ ) {
772               mach->Imms[mach->ImmLimit][i] =
773		  parse.FullToken.FullImmediate.u[i].Float;
774            }
775            mach->ImmLimit += 1;
776         }
777         break;
778
779      case TGSI_TOKEN_TYPE_INSTRUCTION:
780
781         /* save expanded instruction */
782         if (numInstructions == maxInstructions) {
783            instructions = REALLOC(instructions,
784                                   maxInstructions
785                                   * sizeof(struct tgsi_full_instruction),
786                                   (maxInstructions + 10)
787                                   * sizeof(struct tgsi_full_instruction));
788            maxInstructions += 10;
789         }
790
791         memcpy(instructions + numInstructions,
792                &parse.FullToken.FullInstruction,
793                sizeof(instructions[0]));
794
795         numInstructions++;
796         break;
797
798      case TGSI_TOKEN_TYPE_PROPERTY:
799         break;
800
801      default:
802         assert( 0 );
803      }
804   }
805   tgsi_parse_free (&parse);
806
807   if (mach->Declarations) {
808      FREE( mach->Declarations );
809   }
810   mach->Declarations = declarations;
811   mach->NumDeclarations = numDeclarations;
812
813   if (mach->Instructions) {
814      FREE( mach->Instructions );
815   }
816   mach->Instructions = instructions;
817   mach->NumInstructions = numInstructions;
818}
819
820
821struct tgsi_exec_machine *
822tgsi_exec_machine_create( void )
823{
824   struct tgsi_exec_machine *mach;
825   uint i;
826
827   mach = align_malloc( sizeof *mach, 16 );
828   if (!mach)
829      goto fail;
830
831   memset(mach, 0, sizeof(*mach));
832
833   mach->Addrs = &mach->Temps[TGSI_EXEC_TEMP_ADDR];
834   mach->MaxGeometryShaderOutputs = TGSI_MAX_TOTAL_VERTICES;
835   mach->Predicates = &mach->Temps[TGSI_EXEC_TEMP_P0];
836
837   mach->Inputs = align_malloc(sizeof(struct tgsi_exec_vector) * PIPE_MAX_ATTRIBS, 16);
838   mach->Outputs = align_malloc(sizeof(struct tgsi_exec_vector) * PIPE_MAX_ATTRIBS, 16);
839   if (!mach->Inputs || !mach->Outputs)
840      goto fail;
841
842   /* Setup constants needed by the SSE2 executor. */
843   for( i = 0; i < 4; i++ ) {
844      mach->Temps[TGSI_EXEC_TEMP_00000000_I].xyzw[TGSI_EXEC_TEMP_00000000_C].u[i] = 0x00000000;
845      mach->Temps[TGSI_EXEC_TEMP_7FFFFFFF_I].xyzw[TGSI_EXEC_TEMP_7FFFFFFF_C].u[i] = 0x7FFFFFFF;
846      mach->Temps[TGSI_EXEC_TEMP_80000000_I].xyzw[TGSI_EXEC_TEMP_80000000_C].u[i] = 0x80000000;
847      mach->Temps[TGSI_EXEC_TEMP_FFFFFFFF_I].xyzw[TGSI_EXEC_TEMP_FFFFFFFF_C].u[i] = 0xFFFFFFFF;    /* not used */
848      mach->Temps[TGSI_EXEC_TEMP_ONE_I].xyzw[TGSI_EXEC_TEMP_ONE_C].f[i] = 1.0f;
849      mach->Temps[TGSI_EXEC_TEMP_TWO_I].xyzw[TGSI_EXEC_TEMP_TWO_C].f[i] = 2.0f;    /* not used */
850      mach->Temps[TGSI_EXEC_TEMP_128_I].xyzw[TGSI_EXEC_TEMP_128_C].f[i] = 128.0f;
851      mach->Temps[TGSI_EXEC_TEMP_MINUS_128_I].xyzw[TGSI_EXEC_TEMP_MINUS_128_C].f[i] = -128.0f;
852      mach->Temps[TGSI_EXEC_TEMP_THREE_I].xyzw[TGSI_EXEC_TEMP_THREE_C].f[i] = 3.0f;
853      mach->Temps[TGSI_EXEC_TEMP_HALF_I].xyzw[TGSI_EXEC_TEMP_HALF_C].f[i] = 0.5f;
854   }
855
856#ifdef DEBUG
857   /* silence warnings */
858   (void) print_chan;
859   (void) print_temp;
860#endif
861
862   return mach;
863
864fail:
865   if (mach) {
866      align_free(mach->Inputs);
867      align_free(mach->Outputs);
868      align_free(mach);
869   }
870   return NULL;
871}
872
873
874void
875tgsi_exec_machine_destroy(struct tgsi_exec_machine *mach)
876{
877   if (mach) {
878      if (mach->Instructions)
879         FREE(mach->Instructions);
880      if (mach->Declarations)
881         FREE(mach->Declarations);
882
883      align_free(mach->Inputs);
884      align_free(mach->Outputs);
885
886      align_free(mach);
887   }
888}
889
890static void
891micro_add(union tgsi_exec_channel *dst,
892          const union tgsi_exec_channel *src0,
893          const union tgsi_exec_channel *src1)
894{
895   dst->f[0] = src0->f[0] + src1->f[0];
896   dst->f[1] = src0->f[1] + src1->f[1];
897   dst->f[2] = src0->f[2] + src1->f[2];
898   dst->f[3] = src0->f[3] + src1->f[3];
899}
900
901static void
902micro_div(
903   union tgsi_exec_channel *dst,
904   const union tgsi_exec_channel *src0,
905   const union tgsi_exec_channel *src1 )
906{
907   if (src1->f[0] != 0) {
908      dst->f[0] = src0->f[0] / src1->f[0];
909   }
910   if (src1->f[1] != 0) {
911      dst->f[1] = src0->f[1] / src1->f[1];
912   }
913   if (src1->f[2] != 0) {
914      dst->f[2] = src0->f[2] / src1->f[2];
915   }
916   if (src1->f[3] != 0) {
917      dst->f[3] = src0->f[3] / src1->f[3];
918   }
919}
920
921static void
922micro_rcc(union tgsi_exec_channel *dst,
923          const union tgsi_exec_channel *src)
924{
925   uint i;
926
927   for (i = 0; i < 4; i++) {
928      float recip = 1.0f / src->f[i];
929
930      if (recip > 0.0f) {
931         if (recip > 1.884467e+019f) {
932            dst->f[i] = 1.884467e+019f;
933         }
934         else if (recip < 5.42101e-020f) {
935            dst->f[i] = 5.42101e-020f;
936         }
937         else {
938            dst->f[i] = recip;
939         }
940      }
941      else {
942         if (recip < -1.884467e+019f) {
943            dst->f[i] = -1.884467e+019f;
944         }
945         else if (recip > -5.42101e-020f) {
946            dst->f[i] = -5.42101e-020f;
947         }
948         else {
949            dst->f[i] = recip;
950         }
951      }
952   }
953}
954
955static void
956micro_lt(
957   union tgsi_exec_channel *dst,
958   const union tgsi_exec_channel *src0,
959   const union tgsi_exec_channel *src1,
960   const union tgsi_exec_channel *src2,
961   const union tgsi_exec_channel *src3 )
962{
963   dst->f[0] = src0->f[0] < src1->f[0] ? src2->f[0] : src3->f[0];
964   dst->f[1] = src0->f[1] < src1->f[1] ? src2->f[1] : src3->f[1];
965   dst->f[2] = src0->f[2] < src1->f[2] ? src2->f[2] : src3->f[2];
966   dst->f[3] = src0->f[3] < src1->f[3] ? src2->f[3] : src3->f[3];
967}
968
969static void
970micro_max(union tgsi_exec_channel *dst,
971          const union tgsi_exec_channel *src0,
972          const union tgsi_exec_channel *src1)
973{
974   dst->f[0] = src0->f[0] > src1->f[0] ? src0->f[0] : src1->f[0];
975   dst->f[1] = src0->f[1] > src1->f[1] ? src0->f[1] : src1->f[1];
976   dst->f[2] = src0->f[2] > src1->f[2] ? src0->f[2] : src1->f[2];
977   dst->f[3] = src0->f[3] > src1->f[3] ? src0->f[3] : src1->f[3];
978}
979
980static void
981micro_min(union tgsi_exec_channel *dst,
982          const union tgsi_exec_channel *src0,
983          const union tgsi_exec_channel *src1)
984{
985   dst->f[0] = src0->f[0] < src1->f[0] ? src0->f[0] : src1->f[0];
986   dst->f[1] = src0->f[1] < src1->f[1] ? src0->f[1] : src1->f[1];
987   dst->f[2] = src0->f[2] < src1->f[2] ? src0->f[2] : src1->f[2];
988   dst->f[3] = src0->f[3] < src1->f[3] ? src0->f[3] : src1->f[3];
989}
990
991static void
992micro_mul(union tgsi_exec_channel *dst,
993          const union tgsi_exec_channel *src0,
994          const union tgsi_exec_channel *src1)
995{
996   dst->f[0] = src0->f[0] * src1->f[0];
997   dst->f[1] = src0->f[1] * src1->f[1];
998   dst->f[2] = src0->f[2] * src1->f[2];
999   dst->f[3] = src0->f[3] * src1->f[3];
1000}
1001
1002static void
1003micro_neg(
1004   union tgsi_exec_channel *dst,
1005   const union tgsi_exec_channel *src )
1006{
1007   dst->f[0] = -src->f[0];
1008   dst->f[1] = -src->f[1];
1009   dst->f[2] = -src->f[2];
1010   dst->f[3] = -src->f[3];
1011}
1012
1013static void
1014micro_pow(
1015   union tgsi_exec_channel *dst,
1016   const union tgsi_exec_channel *src0,
1017   const union tgsi_exec_channel *src1 )
1018{
1019#if FAST_MATH
1020   dst->f[0] = util_fast_pow( src0->f[0], src1->f[0] );
1021   dst->f[1] = util_fast_pow( src0->f[1], src1->f[1] );
1022   dst->f[2] = util_fast_pow( src0->f[2], src1->f[2] );
1023   dst->f[3] = util_fast_pow( src0->f[3], src1->f[3] );
1024#else
1025   dst->f[0] = powf( src0->f[0], src1->f[0] );
1026   dst->f[1] = powf( src0->f[1], src1->f[1] );
1027   dst->f[2] = powf( src0->f[2], src1->f[2] );
1028   dst->f[3] = powf( src0->f[3], src1->f[3] );
1029#endif
1030}
1031
1032static void
1033micro_sub(union tgsi_exec_channel *dst,
1034          const union tgsi_exec_channel *src0,
1035          const union tgsi_exec_channel *src1)
1036{
1037   dst->f[0] = src0->f[0] - src1->f[0];
1038   dst->f[1] = src0->f[1] - src1->f[1];
1039   dst->f[2] = src0->f[2] - src1->f[2];
1040   dst->f[3] = src0->f[3] - src1->f[3];
1041}
1042
1043static void
1044fetch_src_file_channel(const struct tgsi_exec_machine *mach,
1045                       const uint chan_index,
1046                       const uint file,
1047                       const uint swizzle,
1048                       const union tgsi_exec_channel *index,
1049                       const union tgsi_exec_channel *index2D,
1050                       union tgsi_exec_channel *chan)
1051{
1052   uint i;
1053
1054   assert(swizzle < 4);
1055
1056   switch (file) {
1057   case TGSI_FILE_CONSTANT:
1058      for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1059         assert(index2D->i[i] >= 0 && index2D->i[i] < PIPE_MAX_CONSTANT_BUFFERS);
1060         assert(mach->Consts[index2D->i[i]]);
1061
1062         if (index->i[i] < 0) {
1063            chan->u[i] = 0;
1064         } else {
1065            /* NOTE: copying the const value as a uint instead of float */
1066            const uint constbuf = index2D->i[i];
1067            const uint *buf = (const uint *)mach->Consts[constbuf];
1068            const int pos = index->i[i] * 4 + swizzle;
1069            /* const buffer bounds check */
1070            if (pos < 0 || pos >= mach->ConstsSize[constbuf]) {
1071               if (0) {
1072                  /* Debug: print warning */
1073                  static int count = 0;
1074                  if (count++ < 100)
1075                     debug_printf("TGSI Exec: const buffer index %d"
1076                                  " out of bounds\n", pos);
1077               }
1078               chan->u[i] = 0;
1079            }
1080            else
1081               chan->u[i] = buf[pos];
1082         }
1083      }
1084      break;
1085
1086   case TGSI_FILE_INPUT:
1087      for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1088         /*
1089         if (TGSI_PROCESSOR_GEOMETRY == mach->Processor) {
1090            debug_printf("Fetching Input[%d] (2d=%d, 1d=%d)\n",
1091                         index2D->i[i] * TGSI_EXEC_MAX_INPUT_ATTRIBS + index->i[i],
1092                         index2D->i[i], index->i[i]);
1093                         }*/
1094         int pos = index2D->i[i] * TGSI_EXEC_MAX_INPUT_ATTRIBS + index->i[i];
1095         assert(pos >= 0);
1096         assert(pos < TGSI_MAX_PRIM_VERTICES * PIPE_MAX_ATTRIBS);
1097         chan->u[i] = mach->Inputs[pos].xyzw[swizzle].u[i];
1098      }
1099      break;
1100
1101   case TGSI_FILE_SYSTEM_VALUE:
1102      /* XXX no swizzling at this point.  Will be needed if we put
1103       * gl_FragCoord, for example, in a sys value register.
1104       */
1105      for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1106         chan->u[i] = mach->SystemValue[index->i[i]].u[i];
1107      }
1108      break;
1109
1110   case TGSI_FILE_TEMPORARY:
1111      for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1112         assert(index->i[i] < TGSI_EXEC_NUM_TEMPS);
1113         assert(index2D->i[i] == 0);
1114
1115         chan->u[i] = mach->Temps[index->i[i]].xyzw[swizzle].u[i];
1116      }
1117      break;
1118
1119   case TGSI_FILE_TEMPORARY_ARRAY:
1120      for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1121         assert(index->i[i] < TGSI_EXEC_NUM_TEMPS);
1122         assert(index2D->i[i] < TGSI_EXEC_NUM_TEMP_ARRAYS);
1123
1124         chan->u[i] =
1125            mach->TempArray[index2D->i[i]][index->i[i]].xyzw[swizzle].u[i];
1126      }
1127      break;
1128
1129   case TGSI_FILE_IMMEDIATE:
1130      for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1131         assert(index->i[i] >= 0 && index->i[i] < (int)mach->ImmLimit);
1132         assert(index2D->i[i] == 0);
1133
1134         chan->f[i] = mach->Imms[index->i[i]][swizzle];
1135      }
1136      break;
1137
1138   case TGSI_FILE_IMMEDIATE_ARRAY:
1139      for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1140         assert(index2D->i[i] == 0);
1141
1142         chan->f[i] = mach->ImmArray[index->i[i]][swizzle];
1143      }
1144      break;
1145
1146   case TGSI_FILE_ADDRESS:
1147      for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1148         assert(index->i[i] >= 0);
1149         assert(index2D->i[i] == 0);
1150
1151         chan->u[i] = mach->Addrs[index->i[i]].xyzw[swizzle].u[i];
1152      }
1153      break;
1154
1155   case TGSI_FILE_PREDICATE:
1156      for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1157         assert(index->i[i] >= 0 && index->i[i] < TGSI_EXEC_NUM_PREDS);
1158         assert(index2D->i[i] == 0);
1159
1160         chan->u[i] = mach->Predicates[0].xyzw[swizzle].u[i];
1161      }
1162      break;
1163
1164   case TGSI_FILE_OUTPUT:
1165      /* vertex/fragment output vars can be read too */
1166      for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1167         assert(index->i[i] >= 0);
1168         assert(index2D->i[i] == 0);
1169
1170         chan->u[i] = mach->Outputs[index->i[i]].xyzw[swizzle].u[i];
1171      }
1172      break;
1173
1174   default:
1175      assert(0);
1176      for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1177         chan->u[i] = 0;
1178      }
1179   }
1180}
1181
1182static void
1183fetch_source(const struct tgsi_exec_machine *mach,
1184             union tgsi_exec_channel *chan,
1185             const struct tgsi_full_src_register *reg,
1186             const uint chan_index,
1187             enum tgsi_exec_datatype src_datatype)
1188{
1189   union tgsi_exec_channel index;
1190   union tgsi_exec_channel index2D;
1191   uint swizzle;
1192
1193   /* We start with a direct index into a register file.
1194    *
1195    *    file[1],
1196    *    where:
1197    *       file = Register.File
1198    *       [1] = Register.Index
1199    */
1200   index.i[0] =
1201   index.i[1] =
1202   index.i[2] =
1203   index.i[3] = reg->Register.Index;
1204
1205   /* There is an extra source register that indirectly subscripts
1206    * a register file. The direct index now becomes an offset
1207    * that is being added to the indirect register.
1208    *
1209    *    file[ind[2].x+1],
1210    *    where:
1211    *       ind = Indirect.File
1212    *       [2] = Indirect.Index
1213    *       .x = Indirect.SwizzleX
1214    */
1215   if (reg->Register.Indirect) {
1216      union tgsi_exec_channel index2;
1217      union tgsi_exec_channel indir_index;
1218      const uint execmask = mach->ExecMask;
1219      uint i;
1220
1221      /* which address register (always zero now) */
1222      index2.i[0] =
1223      index2.i[1] =
1224      index2.i[2] =
1225      index2.i[3] = reg->Indirect.Index;
1226      assert(reg->Indirect.File == TGSI_FILE_ADDRESS);
1227      /* get current value of address register[swizzle] */
1228      swizzle = tgsi_util_get_src_register_swizzle( &reg->Indirect, TGSI_CHAN_X );
1229      fetch_src_file_channel(mach,
1230                             chan_index,
1231                             reg->Indirect.File,
1232                             swizzle,
1233                             &index2,
1234                             &ZeroVec,
1235                             &indir_index);
1236
1237      /* add value of address register to the offset */
1238      index.i[0] += indir_index.i[0];
1239      index.i[1] += indir_index.i[1];
1240      index.i[2] += indir_index.i[2];
1241      index.i[3] += indir_index.i[3];
1242
1243      /* for disabled execution channels, zero-out the index to
1244       * avoid using a potential garbage value.
1245       */
1246      for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1247         if ((execmask & (1 << i)) == 0)
1248            index.i[i] = 0;
1249      }
1250   }
1251
1252   /* There is an extra source register that is a second
1253    * subscript to a register file. Effectively it means that
1254    * the register file is actually a 2D array of registers.
1255    *
1256    *    file[3][1],
1257    *    where:
1258    *       [3] = Dimension.Index
1259    */
1260   if (reg->Register.Dimension) {
1261      index2D.i[0] =
1262      index2D.i[1] =
1263      index2D.i[2] =
1264      index2D.i[3] = reg->Dimension.Index;
1265
1266      /* Again, the second subscript index can be addressed indirectly
1267       * identically to the first one.
1268       * Nothing stops us from indirectly addressing the indirect register,
1269       * but there is no need for that, so we won't exercise it.
1270       *
1271       *    file[ind[4].y+3][1],
1272       *    where:
1273       *       ind = DimIndirect.File
1274       *       [4] = DimIndirect.Index
1275       *       .y = DimIndirect.SwizzleX
1276       */
1277      if (reg->Dimension.Indirect) {
1278         union tgsi_exec_channel index2;
1279         union tgsi_exec_channel indir_index;
1280         const uint execmask = mach->ExecMask;
1281         uint i;
1282
1283         index2.i[0] =
1284         index2.i[1] =
1285         index2.i[2] =
1286         index2.i[3] = reg->DimIndirect.Index;
1287
1288         swizzle = tgsi_util_get_src_register_swizzle( &reg->DimIndirect, TGSI_CHAN_X );
1289         fetch_src_file_channel(mach,
1290                                chan_index,
1291                                reg->DimIndirect.File,
1292                                swizzle,
1293                                &index2,
1294                                &ZeroVec,
1295                                &indir_index);
1296
1297         index2D.i[0] += indir_index.i[0];
1298         index2D.i[1] += indir_index.i[1];
1299         index2D.i[2] += indir_index.i[2];
1300         index2D.i[3] += indir_index.i[3];
1301
1302         /* for disabled execution channels, zero-out the index to
1303          * avoid using a potential garbage value.
1304          */
1305         for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1306            if ((execmask & (1 << i)) == 0) {
1307               index2D.i[i] = 0;
1308            }
1309         }
1310      }
1311
1312      /* If by any chance there was a need for a 3D array of register
1313       * files, we would have to check whether Dimension is followed
1314       * by a dimension register and continue the saga.
1315       */
1316   } else {
1317      index2D.i[0] =
1318      index2D.i[1] =
1319      index2D.i[2] =
1320      index2D.i[3] = 0;
1321   }
1322
1323   swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index );
1324   fetch_src_file_channel(mach,
1325                          chan_index,
1326                          reg->Register.File,
1327                          swizzle,
1328                          &index,
1329                          &index2D,
1330                          chan);
1331
1332   if (reg->Register.Absolute) {
1333      if (src_datatype == TGSI_EXEC_DATA_FLOAT) {
1334         micro_abs(chan, chan);
1335      } else {
1336         micro_iabs(chan, chan);
1337      }
1338   }
1339
1340   if (reg->Register.Negate) {
1341      if (src_datatype == TGSI_EXEC_DATA_FLOAT) {
1342         micro_neg(chan, chan);
1343      } else {
1344         micro_ineg(chan, chan);
1345      }
1346   }
1347}
1348
1349static void
1350store_dest(struct tgsi_exec_machine *mach,
1351           const union tgsi_exec_channel *chan,
1352           const struct tgsi_full_dst_register *reg,
1353           const struct tgsi_full_instruction *inst,
1354           uint chan_index,
1355           enum tgsi_exec_datatype dst_datatype)
1356{
1357   uint i;
1358   union tgsi_exec_channel null;
1359   union tgsi_exec_channel *dst;
1360   union tgsi_exec_channel index2D;
1361   uint execmask = mach->ExecMask;
1362   int offset = 0;  /* indirection offset */
1363   int index;
1364
1365   /* for debugging */
1366   if (0 && dst_datatype == TGSI_EXEC_DATA_FLOAT) {
1367      check_inf_or_nan(chan);
1368   }
1369
1370   /* There is an extra source register that indirectly subscripts
1371    * a register file. The direct index now becomes an offset
1372    * that is being added to the indirect register.
1373    *
1374    *    file[ind[2].x+1],
1375    *    where:
1376    *       ind = Indirect.File
1377    *       [2] = Indirect.Index
1378    *       .x = Indirect.SwizzleX
1379    */
1380   if (reg->Register.Indirect) {
1381      union tgsi_exec_channel index;
1382      union tgsi_exec_channel indir_index;
1383      uint swizzle;
1384
1385      /* which address register (always zero for now) */
1386      index.i[0] =
1387      index.i[1] =
1388      index.i[2] =
1389      index.i[3] = reg->Indirect.Index;
1390
1391      /* get current value of address register[swizzle] */
1392      swizzle = tgsi_util_get_src_register_swizzle( &reg->Indirect, TGSI_CHAN_X );
1393
1394      /* fetch values from the address/indirection register */
1395      fetch_src_file_channel(mach,
1396                             chan_index,
1397                             reg->Indirect.File,
1398                             swizzle,
1399                             &index,
1400                             &ZeroVec,
1401                             &indir_index);
1402
1403      /* save indirection offset */
1404      offset = indir_index.i[0];
1405   }
1406
1407   /* There is an extra source register that is a second
1408    * subscript to a register file. Effectively it means that
1409    * the register file is actually a 2D array of registers.
1410    *
1411    *    file[3][1],
1412    *    where:
1413    *       [3] = Dimension.Index
1414    */
1415   if (reg->Register.Dimension) {
1416      index2D.i[0] =
1417      index2D.i[1] =
1418      index2D.i[2] =
1419      index2D.i[3] = reg->Dimension.Index;
1420
1421      /* Again, the second subscript index can be addressed indirectly
1422       * identically to the first one.
1423       * Nothing stops us from indirectly addressing the indirect register,
1424       * but there is no need for that, so we won't exercise it.
1425       *
1426       *    file[ind[4].y+3][1],
1427       *    where:
1428       *       ind = DimIndirect.File
1429       *       [4] = DimIndirect.Index
1430       *       .y = DimIndirect.SwizzleX
1431       */
1432      if (reg->Dimension.Indirect) {
1433         union tgsi_exec_channel index2;
1434         union tgsi_exec_channel indir_index;
1435         const uint execmask = mach->ExecMask;
1436         unsigned swizzle;
1437         uint i;
1438
1439         index2.i[0] =
1440         index2.i[1] =
1441         index2.i[2] =
1442         index2.i[3] = reg->DimIndirect.Index;
1443
1444         swizzle = tgsi_util_get_src_register_swizzle( &reg->DimIndirect, TGSI_CHAN_X );
1445         fetch_src_file_channel(mach,
1446                                chan_index,
1447                                reg->DimIndirect.File,
1448                                swizzle,
1449                                &index2,
1450                                &ZeroVec,
1451                                &indir_index);
1452
1453         index2D.i[0] += indir_index.i[0];
1454         index2D.i[1] += indir_index.i[1];
1455         index2D.i[2] += indir_index.i[2];
1456         index2D.i[3] += indir_index.i[3];
1457
1458         /* for disabled execution channels, zero-out the index to
1459          * avoid using a potential garbage value.
1460          */
1461         for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1462            if ((execmask & (1 << i)) == 0) {
1463               index2D.i[i] = 0;
1464            }
1465         }
1466      }
1467
1468      /* If by any chance there was a need for a 3D array of register
1469       * files, we would have to check whether Dimension is followed
1470       * by a dimension register and continue the saga.
1471       */
1472   } else {
1473      index2D.i[0] =
1474      index2D.i[1] =
1475      index2D.i[2] =
1476      index2D.i[3] = 0;
1477   }
1478
1479   switch (reg->Register.File) {
1480   case TGSI_FILE_NULL:
1481      dst = &null;
1482      break;
1483
1484   case TGSI_FILE_OUTPUT:
1485      index = mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0]
1486         + reg->Register.Index;
1487      dst = &mach->Outputs[offset + index].xyzw[chan_index];
1488#if 0
1489      if (TGSI_PROCESSOR_GEOMETRY == mach->Processor) {
1490         fprintf(stderr, "STORING OUT[%d] mask(%d), = (", offset + index, execmask);
1491         for (i = 0; i < TGSI_QUAD_SIZE; i++)
1492            if (execmask & (1 << i))
1493               fprintf(stderr, "%f, ", chan->f[i]);
1494         fprintf(stderr, ")\n");
1495      }
1496#endif
1497      break;
1498
1499   case TGSI_FILE_TEMPORARY:
1500      index = reg->Register.Index;
1501      assert( index < TGSI_EXEC_NUM_TEMPS );
1502      dst = &mach->Temps[offset + index].xyzw[chan_index];
1503      break;
1504
1505   case TGSI_FILE_TEMPORARY_ARRAY:
1506      index = reg->Register.Index;
1507      assert( index < TGSI_EXEC_NUM_TEMPS );
1508      assert( index2D.i[0] < TGSI_EXEC_NUM_TEMP_ARRAYS );
1509      /* XXX we use index2D.i[0] here but somehow we might
1510       * end up with someone trying to store indirectly in
1511       * different buffers */
1512      dst = &mach->TempArray[index2D.i[0]][offset + index].xyzw[chan_index];
1513      break;
1514
1515   case TGSI_FILE_ADDRESS:
1516      index = reg->Register.Index;
1517      dst = &mach->Addrs[index].xyzw[chan_index];
1518      break;
1519
1520   case TGSI_FILE_PREDICATE:
1521      index = reg->Register.Index;
1522      assert(index < TGSI_EXEC_NUM_PREDS);
1523      dst = &mach->Predicates[index].xyzw[chan_index];
1524      break;
1525
1526   default:
1527      assert( 0 );
1528      return;
1529   }
1530
1531   if (inst->Instruction.Predicate) {
1532      uint swizzle;
1533      union tgsi_exec_channel *pred;
1534
1535      switch (chan_index) {
1536      case TGSI_CHAN_X:
1537         swizzle = inst->Predicate.SwizzleX;
1538         break;
1539      case TGSI_CHAN_Y:
1540         swizzle = inst->Predicate.SwizzleY;
1541         break;
1542      case TGSI_CHAN_Z:
1543         swizzle = inst->Predicate.SwizzleZ;
1544         break;
1545      case TGSI_CHAN_W:
1546         swizzle = inst->Predicate.SwizzleW;
1547         break;
1548      default:
1549         assert(0);
1550         return;
1551      }
1552
1553      assert(inst->Predicate.Index == 0);
1554
1555      pred = &mach->Predicates[inst->Predicate.Index].xyzw[swizzle];
1556
1557      if (inst->Predicate.Negate) {
1558         for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1559            if (pred->u[i]) {
1560               execmask &= ~(1 << i);
1561            }
1562         }
1563      } else {
1564         for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1565            if (!pred->u[i]) {
1566               execmask &= ~(1 << i);
1567            }
1568         }
1569      }
1570   }
1571
1572   switch (inst->Instruction.Saturate) {
1573   case TGSI_SAT_NONE:
1574      for (i = 0; i < TGSI_QUAD_SIZE; i++)
1575         if (execmask & (1 << i))
1576            dst->i[i] = chan->i[i];
1577      break;
1578
1579   case TGSI_SAT_ZERO_ONE:
1580      for (i = 0; i < TGSI_QUAD_SIZE; i++)
1581         if (execmask & (1 << i)) {
1582            if (chan->f[i] < 0.0f)
1583               dst->f[i] = 0.0f;
1584            else if (chan->f[i] > 1.0f)
1585               dst->f[i] = 1.0f;
1586            else
1587               dst->i[i] = chan->i[i];
1588         }
1589      break;
1590
1591   case TGSI_SAT_MINUS_PLUS_ONE:
1592      for (i = 0; i < TGSI_QUAD_SIZE; i++)
1593         if (execmask & (1 << i)) {
1594            if (chan->f[i] < -1.0f)
1595               dst->f[i] = -1.0f;
1596            else if (chan->f[i] > 1.0f)
1597               dst->f[i] = 1.0f;
1598            else
1599               dst->i[i] = chan->i[i];
1600         }
1601      break;
1602
1603   default:
1604      assert( 0 );
1605   }
1606}
1607
/*
 * Shorthands for fetch_source() on source operand INDEX of the current
 * instruction.  Both expect variables named `mach` and `inst` to be in
 * scope where they are used.  FETCH applies the operand modifiers as
 * float operations, IFETCH as integer operations.
 */
#define FETCH(VAL, INDEX, CHAN) \
   fetch_source(mach, VAL, &inst->Src[INDEX], CHAN, TGSI_EXEC_DATA_FLOAT)

#define IFETCH(VAL, INDEX, CHAN) \
   fetch_source(mach, VAL, &inst->Src[INDEX], CHAN, TGSI_EXEC_DATA_INT)
1613
1614
1615/**
1616 * Execute ARB-style KIL which is predicated by a src register.
1617 * Kill fragment if any of the four values is less than zero.
1618 */
1619static void
1620exec_kil(struct tgsi_exec_machine *mach,
1621         const struct tgsi_full_instruction *inst)
1622{
1623   uint uniquemask;
1624   uint chan_index;
1625   uint kilmask = 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
1626   union tgsi_exec_channel r[1];
1627
1628   /* This mask stores component bits that were already tested. */
1629   uniquemask = 0;
1630
1631   for (chan_index = 0; chan_index < 4; chan_index++)
1632   {
1633      uint swizzle;
1634      uint i;
1635
1636      /* unswizzle channel */
1637      swizzle = tgsi_util_get_full_src_register_swizzle (
1638                        &inst->Src[0],
1639                        chan_index);
1640
1641      /* check if the component has not been already tested */
1642      if (uniquemask & (1 << swizzle))
1643         continue;
1644      uniquemask |= 1 << swizzle;
1645
1646      FETCH(&r[0], 0, chan_index);
1647      for (i = 0; i < 4; i++)
1648         if (r[0].f[i] < 0.0f)
1649            kilmask |= 1 << i;
1650   }
1651
1652   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
1653}
1654
1655/**
1656 * Execute NVIDIA-style KIL which is predicated by a condition code.
1657 * Kill fragment if the condition code is TRUE.
1658 */
1659static void
1660exec_kilp(struct tgsi_exec_machine *mach,
1661          const struct tgsi_full_instruction *inst)
1662{
1663   uint kilmask; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
1664
1665   /* "unconditional" kil */
1666   kilmask = mach->ExecMask;
1667   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
1668}
1669
1670static void
1671emit_vertex(struct tgsi_exec_machine *mach)
1672{
1673   /* FIXME: check for exec mask correctly
1674   unsigned i;
1675   for (i = 0; i < TGSI_QUAD_SIZE; ++i) {
1676         if ((mach->ExecMask & (1 << i)))
1677   */
1678   if (mach->ExecMask) {
1679      mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] += mach->NumOutputs;
1680      mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]]++;
1681   }
1682}
1683
1684static void
1685emit_primitive(struct tgsi_exec_machine *mach)
1686{
1687   unsigned *prim_count = &mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0];
1688   /* FIXME: check for exec mask correctly
1689   unsigned i;
1690   for (i = 0; i < TGSI_QUAD_SIZE; ++i) {
1691         if ((mach->ExecMask & (1 << i)))
1692   */
1693   if (mach->ExecMask) {
1694      ++(*prim_count);
1695      debug_assert((*prim_count * mach->NumOutputs) < mach->MaxGeometryShaderOutputs);
1696      mach->Primitives[*prim_count] = 0;
1697   }
1698}
1699
1700static void
1701conditional_emit_primitive(struct tgsi_exec_machine *mach)
1702{
1703   if (TGSI_PROCESSOR_GEOMETRY == mach->Processor) {
1704      int emitted_verts =
1705         mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]];
1706      if (emitted_verts) {
1707         emit_primitive(mach);
1708      }
1709   }
1710}
1711
1712
1713/*
1714 * Fetch four texture samples using STR texture coordinates.
1715 */
1716static void
1717fetch_texel( struct tgsi_sampler *sampler,
1718             const union tgsi_exec_channel *s,
1719             const union tgsi_exec_channel *t,
1720             const union tgsi_exec_channel *p,
1721             const union tgsi_exec_channel *c0,
1722             enum tgsi_sampler_control control,
1723             union tgsi_exec_channel *r,
1724             union tgsi_exec_channel *g,
1725             union tgsi_exec_channel *b,
1726             union tgsi_exec_channel *a )
1727{
1728   uint j;
1729   float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
1730
1731   sampler->get_samples(sampler, s->f, t->f, p->f, c0->f, control, rgba);
1732
1733   for (j = 0; j < 4; j++) {
1734      r->f[j] = rgba[0][j];
1735      g->f[j] = rgba[1][j];
1736      b->f[j] = rgba[2][j];
1737      a->f[j] = rgba[3][j];
1738   }
1739}
1740
1741
1742#define TEX_MODIFIER_NONE           0
1743#define TEX_MODIFIER_PROJECTED      1
1744#define TEX_MODIFIER_LOD_BIAS       2
1745#define TEX_MODIFIER_EXPLICIT_LOD   3
1746
1747
1748static void
1749exec_tex(struct tgsi_exec_machine *mach,
1750         const struct tgsi_full_instruction *inst,
1751         uint modifier)
1752{
1753   const uint unit = inst->Src[1].Register.Index;
1754   union tgsi_exec_channel r[4];
1755   const union tgsi_exec_channel *lod = &ZeroVec;
1756   enum tgsi_sampler_control control;
1757   uint chan;
1758
1759   if (modifier != TEX_MODIFIER_NONE) {
1760      FETCH(&r[3], 0, TGSI_CHAN_W);
1761      if (modifier != TEX_MODIFIER_PROJECTED) {
1762         lod = &r[3];
1763      }
1764   }
1765
1766   if (modifier == TEX_MODIFIER_EXPLICIT_LOD) {
1767      control = tgsi_sampler_lod_explicit;
1768   } else {
1769      control = tgsi_sampler_lod_bias;
1770   }
1771
1772   switch (inst->Texture.Texture) {
1773   case TGSI_TEXTURE_1D:
1774      FETCH(&r[0], 0, TGSI_CHAN_X);
1775
1776      if (modifier == TEX_MODIFIER_PROJECTED) {
1777         micro_div(&r[0], &r[0], &r[3]);
1778      }
1779
1780      fetch_texel(mach->Samplers[unit],
1781                  &r[0], &ZeroVec, &ZeroVec, lod,  /* S, T, P, LOD */
1782                  control,
1783                  &r[0], &r[1], &r[2], &r[3]);     /* R, G, B, A */
1784      break;
1785   case TGSI_TEXTURE_SHADOW1D:
1786      FETCH(&r[0], 0, TGSI_CHAN_X);
1787      FETCH(&r[2], 0, TGSI_CHAN_Z);
1788
1789      if (modifier == TEX_MODIFIER_PROJECTED) {
1790         micro_div(&r[0], &r[0], &r[3]);
1791      }
1792
1793      fetch_texel(mach->Samplers[unit],
1794                  &r[0], &ZeroVec, &r[2], lod,  /* S, T, P, LOD */
1795                  control,
1796                  &r[0], &r[1], &r[2], &r[3]);     /* R, G, B, A */
1797      break;
1798
1799   case TGSI_TEXTURE_2D:
1800   case TGSI_TEXTURE_RECT:
1801   case TGSI_TEXTURE_SHADOW2D:
1802   case TGSI_TEXTURE_SHADOWRECT:
1803      FETCH(&r[0], 0, TGSI_CHAN_X);
1804      FETCH(&r[1], 0, TGSI_CHAN_Y);
1805      FETCH(&r[2], 0, TGSI_CHAN_Z);
1806
1807      if (modifier == TEX_MODIFIER_PROJECTED) {
1808         micro_div(&r[0], &r[0], &r[3]);
1809         micro_div(&r[1], &r[1], &r[3]);
1810         micro_div(&r[2], &r[2], &r[3]);
1811      }
1812
1813      fetch_texel(mach->Samplers[unit],
1814                  &r[0], &r[1], &r[2], lod,     /* S, T, P, LOD */
1815                  control,
1816                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
1817      break;
1818
1819   case TGSI_TEXTURE_1D_ARRAY:
1820      FETCH(&r[0], 0, TGSI_CHAN_X);
1821      FETCH(&r[1], 0, TGSI_CHAN_Y);
1822
1823      if (modifier == TEX_MODIFIER_PROJECTED) {
1824         micro_div(&r[0], &r[0], &r[3]);
1825      }
1826
1827      fetch_texel(mach->Samplers[unit],
1828                  &r[0], &r[1], &ZeroVec, lod,     /* S, T, P, LOD */
1829                  control,
1830                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
1831      break;
1832   case TGSI_TEXTURE_SHADOW1D_ARRAY:
1833      FETCH(&r[0], 0, TGSI_CHAN_X);
1834      FETCH(&r[1], 0, TGSI_CHAN_Y);
1835      FETCH(&r[2], 0, TGSI_CHAN_Z);
1836
1837      if (modifier == TEX_MODIFIER_PROJECTED) {
1838         micro_div(&r[0], &r[0], &r[3]);
1839      }
1840
1841      fetch_texel(mach->Samplers[unit],
1842                  &r[0], &r[1], &r[2], lod,     /* S, T, P, LOD */
1843                  control,
1844                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
1845      break;
1846
1847   case TGSI_TEXTURE_2D_ARRAY:
1848      FETCH(&r[0], 0, TGSI_CHAN_X);
1849      FETCH(&r[1], 0, TGSI_CHAN_Y);
1850      FETCH(&r[2], 0, TGSI_CHAN_Z);
1851
1852      if (modifier == TEX_MODIFIER_PROJECTED) {
1853         micro_div(&r[0], &r[0], &r[3]);
1854         micro_div(&r[1], &r[1], &r[3]);
1855      }
1856
1857      fetch_texel(mach->Samplers[unit],
1858                  &r[0], &r[1], &r[2], lod,     /* S, T, P, LOD */
1859                  control,
1860                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
1861      break;
1862   case TGSI_TEXTURE_SHADOW2D_ARRAY:
1863   case TGSI_TEXTURE_SHADOWCUBE:
1864      FETCH(&r[0], 0, TGSI_CHAN_X);
1865      FETCH(&r[1], 0, TGSI_CHAN_Y);
1866      FETCH(&r[2], 0, TGSI_CHAN_Z);
1867      FETCH(&r[3], 0, TGSI_CHAN_W);
1868
1869      fetch_texel(mach->Samplers[unit],
1870                  &r[0], &r[1], &r[2], &r[3],     /* S, T, P, LOD */
1871                  control,
1872                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
1873      break;
1874   case TGSI_TEXTURE_3D:
1875   case TGSI_TEXTURE_CUBE:
1876      FETCH(&r[0], 0, TGSI_CHAN_X);
1877      FETCH(&r[1], 0, TGSI_CHAN_Y);
1878      FETCH(&r[2], 0, TGSI_CHAN_Z);
1879
1880      if (modifier == TEX_MODIFIER_PROJECTED) {
1881         micro_div(&r[0], &r[0], &r[3]);
1882         micro_div(&r[1], &r[1], &r[3]);
1883         micro_div(&r[2], &r[2], &r[3]);
1884      }
1885
1886      fetch_texel(mach->Samplers[unit],
1887                  &r[0], &r[1], &r[2], lod,
1888                  control,
1889                  &r[0], &r[1], &r[2], &r[3]);
1890      break;
1891
1892   default:
1893      assert(0);
1894   }
1895
1896#if 0
1897   debug_printf("fetch r: %g %g %g %g\n",
1898         r[0].f[0], r[0].f[1], r[0].f[2], r[0].f[3]);
1899   debug_printf("fetch g: %g %g %g %g\n",
1900         r[1].f[0], r[1].f[1], r[1].f[2], r[1].f[3]);
1901   debug_printf("fetch b: %g %g %g %g\n",
1902         r[2].f[0], r[2].f[1], r[2].f[2], r[2].f[3]);
1903   debug_printf("fetch a: %g %g %g %g\n",
1904         r[3].f[0], r[3].f[1], r[3].f[2], r[3].f[3]);
1905#endif
1906
1907   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
1908      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1909         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
1910      }
1911   }
1912}
1913
1914static void
1915exec_txd(struct tgsi_exec_machine *mach,
1916         const struct tgsi_full_instruction *inst)
1917{
1918   const uint unit = inst->Src[3].Register.Index;
1919   union tgsi_exec_channel r[4];
1920   uint chan;
1921
1922   /*
1923    * XXX: This is fake TXD -- the derivatives are not taken into account, yet.
1924    */
1925
1926   switch (inst->Texture.Texture) {
1927   case TGSI_TEXTURE_1D:
1928   case TGSI_TEXTURE_SHADOW1D:
1929
1930      FETCH(&r[0], 0, TGSI_CHAN_X);
1931
1932      fetch_texel(mach->Samplers[unit],
1933                  &r[0], &ZeroVec, &ZeroVec, &ZeroVec,   /* S, T, P, BIAS */
1934                  tgsi_sampler_lod_bias,
1935                  &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
1936      break;
1937
1938   case TGSI_TEXTURE_1D_ARRAY:
1939   case TGSI_TEXTURE_2D:
1940   case TGSI_TEXTURE_RECT:
1941   case TGSI_TEXTURE_SHADOW1D_ARRAY:
1942   case TGSI_TEXTURE_SHADOW2D:
1943   case TGSI_TEXTURE_SHADOWRECT:
1944
1945      FETCH(&r[0], 0, TGSI_CHAN_X);
1946      FETCH(&r[1], 0, TGSI_CHAN_Y);
1947      FETCH(&r[2], 0, TGSI_CHAN_Z);
1948
1949      fetch_texel(mach->Samplers[unit],
1950                  &r[0], &r[1], &r[2], &ZeroVec,   /* inputs */
1951                  tgsi_sampler_lod_bias,
1952                  &r[0], &r[1], &r[2], &r[3]);     /* outputs */
1953      break;
1954
1955   case TGSI_TEXTURE_2D_ARRAY:
1956   case TGSI_TEXTURE_3D:
1957   case TGSI_TEXTURE_CUBE:
1958
1959      FETCH(&r[0], 0, TGSI_CHAN_X);
1960      FETCH(&r[1], 0, TGSI_CHAN_Y);
1961      FETCH(&r[2], 0, TGSI_CHAN_Z);
1962
1963      fetch_texel(mach->Samplers[unit],
1964                  &r[0], &r[1], &r[2], &ZeroVec,
1965                  tgsi_sampler_lod_bias,
1966                  &r[0], &r[1], &r[2], &r[3]);
1967      break;
1968
1969   case TGSI_TEXTURE_SHADOW2D_ARRAY:
1970
1971      FETCH(&r[0], 0, TGSI_CHAN_X);
1972      FETCH(&r[1], 0, TGSI_CHAN_Y);
1973      FETCH(&r[2], 0, TGSI_CHAN_Z);
1974      FETCH(&r[3], 0, TGSI_CHAN_W);
1975
1976      fetch_texel(mach->Samplers[unit],
1977                  &r[0], &r[1], &r[2], &r[3],
1978                  tgsi_sampler_lod_bias,
1979                  &r[0], &r[1], &r[2], &r[3]);
1980      break;
1981
1982   default:
1983      assert(0);
1984   }
1985
1986   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
1987      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1988         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
1989      }
1990   }
1991}
1992
1993
1994static void
1995exec_txf(struct tgsi_exec_machine *mach,
1996	 const struct tgsi_full_instruction *inst)
1997{
1998   struct tgsi_sampler *sampler;
1999   const uint unit = inst->Src[2].Register.Index;
2000   union tgsi_exec_channel r[4];
2001   union tgsi_exec_channel offset[3];
2002   uint chan;
2003   float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
2004   int j;
2005   int8_t offsets[3];
2006
2007   if (inst->Texture.NumOffsets == 1) {
2008      union tgsi_exec_channel index;
2009      index.i[0] = index.i[1] = index.i[2] = index.i[3] = inst->TexOffsets[0].Index;
2010      fetch_src_file_channel(mach, 0, inst->TexOffsets[0].File,
2011                             inst->TexOffsets[0].SwizzleX, &index, &ZeroVec, &offset[0]);
2012      fetch_src_file_channel(mach, 0, inst->TexOffsets[0].File,
2013                             inst->TexOffsets[0].SwizzleY, &index, &ZeroVec, &offset[1]);
2014      fetch_src_file_channel(mach, 0, inst->TexOffsets[0].File,
2015                             inst->TexOffsets[0].SwizzleZ, &index, &ZeroVec, &offset[2]);
2016     offsets[0] = offset[0].i[0];
2017     offsets[1] = offset[1].i[0];
2018     offsets[2] = offset[2].i[0];
2019   } else
2020     offsets[0] = offsets[1] = offsets[2] = 0;
2021
2022   IFETCH(&r[3], 0, TGSI_CHAN_W);
2023
2024   switch(inst->Texture.Texture) {
2025   case TGSI_TEXTURE_3D:
2026   case TGSI_TEXTURE_2D_ARRAY:
2027   case TGSI_TEXTURE_SHADOW2D_ARRAY:
2028      IFETCH(&r[2], 0, TGSI_CHAN_Z);
2029      /* fallthrough */
2030   case TGSI_TEXTURE_2D:
2031   case TGSI_TEXTURE_RECT:
2032   case TGSI_TEXTURE_SHADOW1D_ARRAY:
2033   case TGSI_TEXTURE_SHADOW2D:
2034   case TGSI_TEXTURE_SHADOWRECT:
2035   case TGSI_TEXTURE_1D_ARRAY:
2036      IFETCH(&r[1], 0, TGSI_CHAN_Y);
2037      /* fallthrough */
2038   case TGSI_TEXTURE_1D:
2039   case TGSI_TEXTURE_SHADOW1D:
2040      IFETCH(&r[0], 0, TGSI_CHAN_X);
2041      break;
2042   default:
2043      assert(0);
2044      break;
2045   }
2046
2047   sampler = mach->Samplers[unit];
2048   sampler->get_texel(sampler, r[0].i, r[1].i, r[2].i, r[3].i,
2049		      offsets, rgba);
2050
2051   for (j = 0; j < TGSI_QUAD_SIZE; j++) {
2052      r[0].f[j] = rgba[0][j];
2053      r[1].f[j] = rgba[1][j];
2054      r[2].f[j] = rgba[2][j];
2055      r[3].f[j] = rgba[3][j];
2056   }
2057
2058   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2059      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2060         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2061      }
2062   }
2063}
2064
2065static void
2066exec_txq(struct tgsi_exec_machine *mach,
2067         const struct tgsi_full_instruction *inst)
2068{
2069   struct tgsi_sampler *sampler;
2070   const uint unit = inst->Src[1].Register.Index;
2071   int result[4];
2072   union tgsi_exec_channel r[4], src;
2073   uint chan;
2074   int i,j;
2075
2076   fetch_source(mach, &src, &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_INT);
2077   sampler = mach->Samplers[unit];
2078
2079   sampler->get_dims(sampler, src.i[0], result);
2080
2081   for (i = 0; i < TGSI_QUAD_SIZE; i++) {
2082      for (j = 0; j < 4; j++) {
2083	 r[j].i[i] = result[j];
2084      }
2085   }
2086
2087   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2088      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2089	 store_dest(mach, &r[chan], &inst->Dst[0], inst, chan,
2090		    TGSI_EXEC_DATA_INT);
2091      }
2092   }
2093}
2094
2095static void
2096exec_sample(struct tgsi_exec_machine *mach,
2097            const struct tgsi_full_instruction *inst,
2098            uint modifier)
2099{
2100   const uint resource_unit = inst->Src[1].Register.Index;
2101   const uint sampler_unit = inst->Src[2].Register.Index;
2102   union tgsi_exec_channel r[4];
2103   const union tgsi_exec_channel *lod = &ZeroVec;
2104   enum tgsi_sampler_control control;
2105   uint chan;
2106
2107   if (modifier != TEX_MODIFIER_NONE) {
2108      if (modifier == TEX_MODIFIER_LOD_BIAS)
2109         FETCH(&r[3], 3, TGSI_CHAN_X);
2110      else /*TEX_MODIFIER_LOD*/
2111         FETCH(&r[3], 0, TGSI_CHAN_W);
2112
2113      if (modifier != TEX_MODIFIER_PROJECTED) {
2114         lod = &r[3];
2115      }
2116   }
2117
2118   if (modifier == TEX_MODIFIER_EXPLICIT_LOD) {
2119      control = tgsi_sampler_lod_explicit;
2120   } else {
2121      control = tgsi_sampler_lod_bias;
2122   }
2123
2124   switch (mach->SamplerViews[resource_unit].Resource) {
2125   case TGSI_TEXTURE_1D:
2126   case TGSI_TEXTURE_SHADOW1D:
2127      FETCH(&r[0], 0, TGSI_CHAN_X);
2128
2129      if (modifier == TEX_MODIFIER_PROJECTED) {
2130         micro_div(&r[0], &r[0], &r[3]);
2131      }
2132
2133      fetch_texel(mach->Samplers[sampler_unit],
2134                  &r[0], &ZeroVec, &ZeroVec, lod,  /* S, T, P, LOD */
2135                  control,
2136                  &r[0], &r[1], &r[2], &r[3]);     /* R, G, B, A */
2137      break;
2138
2139   case TGSI_TEXTURE_1D_ARRAY:
2140   case TGSI_TEXTURE_2D:
2141   case TGSI_TEXTURE_RECT:
2142   case TGSI_TEXTURE_SHADOW1D_ARRAY:
2143   case TGSI_TEXTURE_SHADOW2D:
2144   case TGSI_TEXTURE_SHADOWRECT:
2145      FETCH(&r[0], 0, TGSI_CHAN_X);
2146      FETCH(&r[1], 0, TGSI_CHAN_Y);
2147      FETCH(&r[2], 0, TGSI_CHAN_Z);
2148
2149      if (modifier == TEX_MODIFIER_PROJECTED) {
2150         micro_div(&r[0], &r[0], &r[3]);
2151         micro_div(&r[1], &r[1], &r[3]);
2152         micro_div(&r[2], &r[2], &r[3]);
2153      }
2154
2155      fetch_texel(mach->Samplers[sampler_unit],
2156                  &r[0], &r[1], &r[2], lod,     /* S, T, P, LOD */
2157                  control,
2158                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
2159      break;
2160
2161   case TGSI_TEXTURE_2D_ARRAY:
2162   case TGSI_TEXTURE_3D:
2163   case TGSI_TEXTURE_CUBE:
2164      FETCH(&r[0], 0, TGSI_CHAN_X);
2165      FETCH(&r[1], 0, TGSI_CHAN_Y);
2166      FETCH(&r[2], 0, TGSI_CHAN_Z);
2167
2168      if (modifier == TEX_MODIFIER_PROJECTED) {
2169         micro_div(&r[0], &r[0], &r[3]);
2170         micro_div(&r[1], &r[1], &r[3]);
2171         micro_div(&r[2], &r[2], &r[3]);
2172      }
2173
2174      fetch_texel(mach->Samplers[sampler_unit],
2175                  &r[0], &r[1], &r[2], lod,
2176                  control,
2177                  &r[0], &r[1], &r[2], &r[3]);
2178      break;
2179
2180   case TGSI_TEXTURE_SHADOW2D_ARRAY:
2181   case TGSI_TEXTURE_SHADOWCUBE:
2182      FETCH(&r[0], 0, TGSI_CHAN_X);
2183      FETCH(&r[1], 0, TGSI_CHAN_Y);
2184      FETCH(&r[2], 0, TGSI_CHAN_Z);
2185      FETCH(&r[3], 0, TGSI_CHAN_W);
2186
2187      assert(modifier != TEX_MODIFIER_PROJECTED);
2188
2189      fetch_texel(mach->Samplers[sampler_unit],
2190                  &r[0], &r[1], &r[2], &r[3],
2191                  control,
2192                  &r[0], &r[1], &r[2], &r[3]);
2193      break;
2194
2195   default:
2196      assert(0);
2197   }
2198
2199   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2200      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2201         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2202      }
2203   }
2204}
2205
2206static void
2207exec_sample_d(struct tgsi_exec_machine *mach,
2208              const struct tgsi_full_instruction *inst)
2209{
2210   const uint resource_unit = inst->Src[1].Register.Index;
2211   const uint sampler_unit = inst->Src[2].Register.Index;
2212   union tgsi_exec_channel r[4];
2213   uint chan;
2214   /*
2215    * XXX: This is fake SAMPLE_D -- the derivatives are not taken into account, yet.
2216    */
2217
2218   switch (mach->SamplerViews[resource_unit].Resource) {
2219   case TGSI_TEXTURE_1D:
2220   case TGSI_TEXTURE_SHADOW1D:
2221
2222      FETCH(&r[0], 0, TGSI_CHAN_X);
2223
2224      fetch_texel(mach->Samplers[sampler_unit],
2225                  &r[0], &ZeroVec, &ZeroVec, &ZeroVec,   /* S, T, P, BIAS */
2226                  tgsi_sampler_lod_bias,
2227                  &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
2228      break;
2229
2230   case TGSI_TEXTURE_2D:
2231   case TGSI_TEXTURE_RECT:
2232   case TGSI_TEXTURE_SHADOW2D:
2233   case TGSI_TEXTURE_SHADOWRECT:
2234
2235      FETCH(&r[0], 0, TGSI_CHAN_X);
2236      FETCH(&r[1], 0, TGSI_CHAN_Y);
2237      FETCH(&r[2], 0, TGSI_CHAN_Z);
2238
2239      fetch_texel(mach->Samplers[sampler_unit],
2240                  &r[0], &r[1], &r[2], &ZeroVec,   /* inputs */
2241                  tgsi_sampler_lod_bias,
2242                  &r[0], &r[1], &r[2], &r[3]);     /* outputs */
2243      break;
2244
2245   case TGSI_TEXTURE_3D:
2246   case TGSI_TEXTURE_CUBE:
2247
2248      FETCH(&r[0], 0, TGSI_CHAN_X);
2249      FETCH(&r[1], 0, TGSI_CHAN_Y);
2250      FETCH(&r[2], 0, TGSI_CHAN_Z);
2251
2252      fetch_texel(mach->Samplers[sampler_unit],
2253                  &r[0], &r[1], &r[2], &ZeroVec,
2254                  tgsi_sampler_lod_bias,
2255                  &r[0], &r[1], &r[2], &r[3]);
2256      break;
2257
2258   default:
2259      assert(0);
2260   }
2261
2262   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2263      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2264         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2265      }
2266   }
2267}
2268
2269
2270/**
2271 * Evaluate a constant-valued coefficient at the position of the
2272 * current quad.
2273 */
2274static void
2275eval_constant_coef(
2276   struct tgsi_exec_machine *mach,
2277   unsigned attrib,
2278   unsigned chan )
2279{
2280   unsigned i;
2281
2282   for( i = 0; i < TGSI_QUAD_SIZE; i++ ) {
2283      mach->Inputs[attrib].xyzw[chan].f[i] = mach->InterpCoefs[attrib].a0[chan];
2284   }
2285}
2286
2287/**
2288 * Evaluate a linear-valued coefficient at the position of the
2289 * current quad.
2290 */
2291static void
2292eval_linear_coef(
2293   struct tgsi_exec_machine *mach,
2294   unsigned attrib,
2295   unsigned chan )
2296{
2297   const float x = mach->QuadPos.xyzw[0].f[0];
2298   const float y = mach->QuadPos.xyzw[1].f[0];
2299   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
2300   const float dady = mach->InterpCoefs[attrib].dady[chan];
2301   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
2302   mach->Inputs[attrib].xyzw[chan].f[0] = a0;
2303   mach->Inputs[attrib].xyzw[chan].f[1] = a0 + dadx;
2304   mach->Inputs[attrib].xyzw[chan].f[2] = a0 + dady;
2305   mach->Inputs[attrib].xyzw[chan].f[3] = a0 + dadx + dady;
2306}
2307
2308/**
2309 * Evaluate a perspective-valued coefficient at the position of the
2310 * current quad.
2311 */
2312static void
2313eval_perspective_coef(
2314   struct tgsi_exec_machine *mach,
2315   unsigned attrib,
2316   unsigned chan )
2317{
2318   const float x = mach->QuadPos.xyzw[0].f[0];
2319   const float y = mach->QuadPos.xyzw[1].f[0];
2320   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
2321   const float dady = mach->InterpCoefs[attrib].dady[chan];
2322   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
2323   const float *w = mach->QuadPos.xyzw[3].f;
2324   /* divide by W here */
2325   mach->Inputs[attrib].xyzw[chan].f[0] = a0 / w[0];
2326   mach->Inputs[attrib].xyzw[chan].f[1] = (a0 + dadx) / w[1];
2327   mach->Inputs[attrib].xyzw[chan].f[2] = (a0 + dady) / w[2];
2328   mach->Inputs[attrib].xyzw[chan].f[3] = (a0 + dadx + dady) / w[3];
2329}
2330
2331
2332typedef void (* eval_coef_func)(
2333   struct tgsi_exec_machine *mach,
2334   unsigned attrib,
2335   unsigned chan );
2336
2337static void
2338exec_declaration(struct tgsi_exec_machine *mach,
2339                 const struct tgsi_full_declaration *decl)
2340{
2341   if (decl->Declaration.File == TGSI_FILE_SAMPLER_VIEW) {
2342      mach->SamplerViews[decl->Range.First] = decl->SamplerView;
2343      return;
2344   }
2345
2346   if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) {
2347      if (decl->Declaration.File == TGSI_FILE_INPUT) {
2348         uint first, last, mask;
2349
2350         first = decl->Range.First;
2351         last = decl->Range.Last;
2352         mask = decl->Declaration.UsageMask;
2353
2354         /* XXX we could remove this special-case code since
2355          * mach->InterpCoefs[first].a0 should already have the
2356          * front/back-face value.  But we should first update the
2357          * ureg code to emit the right UsageMask value (WRITEMASK_X).
2358          * Then, we could remove the tgsi_exec_machine::Face field.
2359          */
2360         /* XXX make FACE a system value */
2361         if (decl->Semantic.Name == TGSI_SEMANTIC_FACE) {
2362            uint i;
2363
2364            assert(decl->Semantic.Index == 0);
2365            assert(first == last);
2366
2367            for (i = 0; i < TGSI_QUAD_SIZE; i++) {
2368               mach->Inputs[first].xyzw[0].f[i] = mach->Face;
2369            }
2370         } else {
2371            eval_coef_func eval;
2372            uint i, j;
2373
2374            switch (decl->Interp.Interpolate) {
2375            case TGSI_INTERPOLATE_CONSTANT:
2376               eval = eval_constant_coef;
2377               break;
2378
2379            case TGSI_INTERPOLATE_LINEAR:
2380               eval = eval_linear_coef;
2381               break;
2382
2383            case TGSI_INTERPOLATE_PERSPECTIVE:
2384               eval = eval_perspective_coef;
2385               break;
2386
2387            case TGSI_INTERPOLATE_COLOR:
2388               eval = mach->flatshade_color ? eval_constant_coef : eval_perspective_coef;
2389               break;
2390
2391            default:
2392               assert(0);
2393               return;
2394            }
2395
2396            for (j = 0; j < TGSI_NUM_CHANNELS; j++) {
2397               if (mask & (1 << j)) {
2398                  for (i = first; i <= last; i++) {
2399                     eval(mach, i, j);
2400                  }
2401               }
2402            }
2403         }
2404      }
2405   }
2406
2407   if (decl->Declaration.File == TGSI_FILE_SYSTEM_VALUE) {
2408      mach->SysSemanticToIndex[decl->Declaration.Semantic] = decl->Range.First;
2409   }
2410}
2411
2412
2413typedef void (* micro_op)(union tgsi_exec_channel *dst);
2414
2415static void
2416exec_vector(struct tgsi_exec_machine *mach,
2417            const struct tgsi_full_instruction *inst,
2418            micro_op op,
2419            enum tgsi_exec_datatype dst_datatype)
2420{
2421   unsigned int chan;
2422
2423   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2424      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2425         union tgsi_exec_channel dst;
2426
2427         op(&dst);
2428         store_dest(mach, &dst, &inst->Dst[0], inst, chan, dst_datatype);
2429      }
2430   }
2431}
2432
2433typedef void (* micro_unary_op)(union tgsi_exec_channel *dst,
2434                                const union tgsi_exec_channel *src);
2435
2436static void
2437exec_scalar_unary(struct tgsi_exec_machine *mach,
2438                  const struct tgsi_full_instruction *inst,
2439                  micro_unary_op op,
2440                  enum tgsi_exec_datatype dst_datatype,
2441                  enum tgsi_exec_datatype src_datatype)
2442{
2443   unsigned int chan;
2444   union tgsi_exec_channel src;
2445   union tgsi_exec_channel dst;
2446
2447   fetch_source(mach, &src, &inst->Src[0], TGSI_CHAN_X, src_datatype);
2448   op(&dst, &src);
2449   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2450      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2451         store_dest(mach, &dst, &inst->Dst[0], inst, chan, dst_datatype);
2452      }
2453   }
2454}
2455
2456static void
2457exec_vector_unary(struct tgsi_exec_machine *mach,
2458                  const struct tgsi_full_instruction *inst,
2459                  micro_unary_op op,
2460                  enum tgsi_exec_datatype dst_datatype,
2461                  enum tgsi_exec_datatype src_datatype)
2462{
2463   unsigned int chan;
2464   struct tgsi_exec_vector dst;
2465
2466   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2467      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2468         union tgsi_exec_channel src;
2469
2470         fetch_source(mach, &src, &inst->Src[0], chan, src_datatype);
2471         op(&dst.xyzw[chan], &src);
2472      }
2473   }
2474   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2475      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2476         store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
2477      }
2478   }
2479}
2480
2481typedef void (* micro_binary_op)(union tgsi_exec_channel *dst,
2482                                 const union tgsi_exec_channel *src0,
2483                                 const union tgsi_exec_channel *src1);
2484
2485static void
2486exec_scalar_binary(struct tgsi_exec_machine *mach,
2487                   const struct tgsi_full_instruction *inst,
2488                   micro_binary_op op,
2489                   enum tgsi_exec_datatype dst_datatype,
2490                   enum tgsi_exec_datatype src_datatype)
2491{
2492   unsigned int chan;
2493   union tgsi_exec_channel src[2];
2494   union tgsi_exec_channel dst;
2495
2496   fetch_source(mach, &src[0], &inst->Src[0], TGSI_CHAN_X, src_datatype);
2497   fetch_source(mach, &src[1], &inst->Src[1], TGSI_CHAN_Y, src_datatype);
2498   op(&dst, &src[0], &src[1]);
2499   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2500      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2501         store_dest(mach, &dst, &inst->Dst[0], inst, chan, dst_datatype);
2502      }
2503   }
2504}
2505
2506static void
2507exec_vector_binary(struct tgsi_exec_machine *mach,
2508                   const struct tgsi_full_instruction *inst,
2509                   micro_binary_op op,
2510                   enum tgsi_exec_datatype dst_datatype,
2511                   enum tgsi_exec_datatype src_datatype)
2512{
2513   unsigned int chan;
2514   struct tgsi_exec_vector dst;
2515
2516   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2517      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2518         union tgsi_exec_channel src[2];
2519
2520         fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
2521         fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
2522         op(&dst.xyzw[chan], &src[0], &src[1]);
2523      }
2524   }
2525   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2526      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2527         store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
2528      }
2529   }
2530}
2531
2532typedef void (* micro_trinary_op)(union tgsi_exec_channel *dst,
2533                                  const union tgsi_exec_channel *src0,
2534                                  const union tgsi_exec_channel *src1,
2535                                  const union tgsi_exec_channel *src2);
2536
2537static void
2538exec_vector_trinary(struct tgsi_exec_machine *mach,
2539                    const struct tgsi_full_instruction *inst,
2540                    micro_trinary_op op,
2541                    enum tgsi_exec_datatype dst_datatype,
2542                    enum tgsi_exec_datatype src_datatype)
2543{
2544   unsigned int chan;
2545   struct tgsi_exec_vector dst;
2546
2547   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2548      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2549         union tgsi_exec_channel src[3];
2550
2551         fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
2552         fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
2553         fetch_source(mach, &src[2], &inst->Src[2], chan, src_datatype);
2554         op(&dst.xyzw[chan], &src[0], &src[1], &src[2]);
2555      }
2556   }
2557   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2558      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2559         store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
2560      }
2561   }
2562}
2563
2564static void
2565exec_dp3(struct tgsi_exec_machine *mach,
2566         const struct tgsi_full_instruction *inst)
2567{
2568   unsigned int chan;
2569   union tgsi_exec_channel arg[3];
2570
2571   fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
2572   fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
2573   micro_mul(&arg[2], &arg[0], &arg[1]);
2574
2575   for (chan = TGSI_CHAN_Y; chan <= TGSI_CHAN_Z; chan++) {
2576      fetch_source(mach, &arg[0], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
2577      fetch_source(mach, &arg[1], &inst->Src[1], chan, TGSI_EXEC_DATA_FLOAT);
2578      micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
2579   }
2580
2581   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2582      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2583         store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2584      }
2585   }
2586}
2587
2588static void
2589exec_dp4(struct tgsi_exec_machine *mach,
2590         const struct tgsi_full_instruction *inst)
2591{
2592   unsigned int chan;
2593   union tgsi_exec_channel arg[3];
2594
2595   fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
2596   fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
2597   micro_mul(&arg[2], &arg[0], &arg[1]);
2598
2599   for (chan = TGSI_CHAN_Y; chan <= TGSI_CHAN_W; chan++) {
2600      fetch_source(mach, &arg[0], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
2601      fetch_source(mach, &arg[1], &inst->Src[1], chan, TGSI_EXEC_DATA_FLOAT);
2602      micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
2603   }
2604
2605   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2606      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2607         store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2608      }
2609   }
2610}
2611
2612static void
2613exec_dp2a(struct tgsi_exec_machine *mach,
2614          const struct tgsi_full_instruction *inst)
2615{
2616   unsigned int chan;
2617   union tgsi_exec_channel arg[3];
2618
2619   fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
2620   fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
2621   micro_mul(&arg[2], &arg[0], &arg[1]);
2622
2623   fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2624   fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2625   micro_mad(&arg[0], &arg[0], &arg[1], &arg[2]);
2626
2627   fetch_source(mach, &arg[1], &inst->Src[2], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
2628   micro_add(&arg[0], &arg[0], &arg[1]);
2629
2630   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2631      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2632         store_dest(mach, &arg[0], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2633      }
2634   }
2635}
2636
2637static void
2638exec_dph(struct tgsi_exec_machine *mach,
2639         const struct tgsi_full_instruction *inst)
2640{
2641   unsigned int chan;
2642   union tgsi_exec_channel arg[3];
2643
2644   fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
2645   fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
2646   micro_mul(&arg[2], &arg[0], &arg[1]);
2647
2648   fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2649   fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2650   micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
2651
2652   fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2653   fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2654   micro_mad(&arg[0], &arg[0], &arg[1], &arg[2]);
2655
2656   fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
2657   micro_add(&arg[0], &arg[0], &arg[1]);
2658
2659   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2660      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2661         store_dest(mach, &arg[0], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2662      }
2663   }
2664}
2665
2666static void
2667exec_dp2(struct tgsi_exec_machine *mach,
2668         const struct tgsi_full_instruction *inst)
2669{
2670   unsigned int chan;
2671   union tgsi_exec_channel arg[3];
2672
2673   fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
2674   fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
2675   micro_mul(&arg[2], &arg[0], &arg[1]);
2676
2677   fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2678   fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2679   micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
2680
2681   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2682      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2683         store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2684      }
2685   }
2686}
2687
2688static void
2689exec_nrm4(struct tgsi_exec_machine *mach,
2690          const struct tgsi_full_instruction *inst)
2691{
2692   unsigned int chan;
2693   union tgsi_exec_channel arg[4];
2694   union tgsi_exec_channel scale;
2695
2696   fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
2697   micro_mul(&scale, &arg[0], &arg[0]);
2698
2699   for (chan = TGSI_CHAN_Y; chan <= TGSI_CHAN_W; chan++) {
2700      union tgsi_exec_channel product;
2701
2702      fetch_source(mach, &arg[chan], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
2703      micro_mul(&product, &arg[chan], &arg[chan]);
2704      micro_add(&scale, &scale, &product);
2705   }
2706
2707   micro_rsq(&scale, &scale);
2708
2709   for (chan = TGSI_CHAN_X; chan <= TGSI_CHAN_W; chan++) {
2710      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2711         micro_mul(&arg[chan], &arg[chan], &scale);
2712         store_dest(mach, &arg[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2713      }
2714   }
2715}
2716
2717static void
2718exec_nrm3(struct tgsi_exec_machine *mach,
2719          const struct tgsi_full_instruction *inst)
2720{
2721   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XYZ) {
2722      unsigned int chan;
2723      union tgsi_exec_channel arg[3];
2724      union tgsi_exec_channel scale;
2725
2726      fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
2727      micro_mul(&scale, &arg[0], &arg[0]);
2728
2729      for (chan = TGSI_CHAN_Y; chan <= TGSI_CHAN_Z; chan++) {
2730         union tgsi_exec_channel product;
2731
2732         fetch_source(mach, &arg[chan], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
2733         micro_mul(&product, &arg[chan], &arg[chan]);
2734         micro_add(&scale, &scale, &product);
2735      }
2736
2737      micro_rsq(&scale, &scale);
2738
2739      for (chan = TGSI_CHAN_X; chan <= TGSI_CHAN_Z; chan++) {
2740         if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2741            micro_mul(&arg[chan], &arg[chan], &scale);
2742            store_dest(mach, &arg[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2743         }
2744      }
2745   }
2746
2747   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2748      store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
2749   }
2750}
2751
2752static void
2753exec_scs(struct tgsi_exec_machine *mach,
2754         const struct tgsi_full_instruction *inst)
2755{
2756   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY) {
2757      union tgsi_exec_channel arg;
2758      union tgsi_exec_channel result;
2759
2760      fetch_source(mach, &arg, &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
2761
2762      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2763         micro_cos(&result, &arg);
2764         store_dest(mach, &result, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
2765      }
2766      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2767         micro_sin(&result, &arg);
2768         store_dest(mach, &result, &inst->Dst[0], inst, TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2769      }
2770   }
2771   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2772      store_dest(mach, &ZeroVec, &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2773   }
2774   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2775      store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
2776   }
2777}
2778
2779static void
2780exec_x2d(struct tgsi_exec_machine *mach,
2781         const struct tgsi_full_instruction *inst)
2782{
2783   union tgsi_exec_channel r[4];
2784   union tgsi_exec_channel d[2];
2785
2786   fetch_source(mach, &r[0], &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
2787   fetch_source(mach, &r[1], &inst->Src[1], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2788   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XZ) {
2789      fetch_source(mach, &r[2], &inst->Src[2], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
2790      micro_mul(&r[2], &r[2], &r[0]);
2791      fetch_source(mach, &r[3], &inst->Src[2], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2792      micro_mul(&r[3], &r[3], &r[1]);
2793      micro_add(&r[2], &r[2], &r[3]);
2794      fetch_source(mach, &r[3], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
2795      micro_add(&d[0], &r[2], &r[3]);
2796   }
2797   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_YW) {
2798      fetch_source(mach, &r[2], &inst->Src[2], TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2799      micro_mul(&r[2], &r[2], &r[0]);
2800      fetch_source(mach, &r[3], &inst->Src[2], TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
2801      micro_mul(&r[3], &r[3], &r[1]);
2802      micro_add(&r[2], &r[2], &r[3]);
2803      fetch_source(mach, &r[3], &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2804      micro_add(&d[1], &r[2], &r[3]);
2805   }
2806   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2807      store_dest(mach, &d[0], &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
2808   }
2809   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2810      store_dest(mach, &d[1], &inst->Dst[0], inst, TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2811   }
2812   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2813      store_dest(mach, &d[0], &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2814   }
2815   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2816      store_dest(mach, &d[1], &inst->Dst[0], inst, TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
2817   }
2818}
2819
2820static void
2821exec_rfl(struct tgsi_exec_machine *mach,
2822         const struct tgsi_full_instruction *inst)
2823{
2824   union tgsi_exec_channel r[9];
2825
2826   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XYZ) {
2827      /* r0 = dp3(src0, src0) */
2828      fetch_source(mach, &r[2], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
2829      micro_mul(&r[0], &r[2], &r[2]);
2830      fetch_source(mach, &r[4], &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2831      micro_mul(&r[8], &r[4], &r[4]);
2832      micro_add(&r[0], &r[0], &r[8]);
2833      fetch_source(mach, &r[6], &inst->Src[0], TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2834      micro_mul(&r[8], &r[6], &r[6]);
2835      micro_add(&r[0], &r[0], &r[8]);
2836
2837      /* r1 = dp3(src0, src1) */
2838      fetch_source(mach, &r[3], &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
2839      micro_mul(&r[1], &r[2], &r[3]);
2840      fetch_source(mach, &r[5], &inst->Src[1], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2841      micro_mul(&r[8], &r[4], &r[5]);
2842      micro_add(&r[1], &r[1], &r[8]);
2843      fetch_source(mach, &r[7], &inst->Src[1], TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2844      micro_mul(&r[8], &r[6], &r[7]);
2845      micro_add(&r[1], &r[1], &r[8]);
2846
2847      /* r1 = 2 * r1 / r0 */
2848      micro_add(&r[1], &r[1], &r[1]);
2849      micro_div(&r[1], &r[1], &r[0]);
2850
2851      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2852         micro_mul(&r[2], &r[2], &r[1]);
2853         micro_sub(&r[2], &r[2], &r[3]);
2854         store_dest(mach, &r[2], &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
2855      }
2856      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2857         micro_mul(&r[4], &r[4], &r[1]);
2858         micro_sub(&r[4], &r[4], &r[5]);
2859         store_dest(mach, &r[4], &inst->Dst[0], inst, TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2860      }
2861      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2862         micro_mul(&r[6], &r[6], &r[1]);
2863         micro_sub(&r[6], &r[6], &r[7]);
2864         store_dest(mach, &r[6], &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2865      }
2866   }
2867   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2868      store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
2869   }
2870}
2871
2872static void
2873exec_xpd(struct tgsi_exec_machine *mach,
2874         const struct tgsi_full_instruction *inst)
2875{
2876   union tgsi_exec_channel r[6];
2877   union tgsi_exec_channel d[3];
2878
2879   fetch_source(mach, &r[0], &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2880   fetch_source(mach, &r[1], &inst->Src[1], TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2881
2882   micro_mul(&r[2], &r[0], &r[1]);
2883
2884   fetch_source(mach, &r[3], &inst->Src[0], TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2885   fetch_source(mach, &r[4], &inst->Src[1], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2886
2887   micro_mul(&r[5], &r[3], &r[4] );
2888   micro_sub(&d[TGSI_CHAN_X], &r[2], &r[5]);
2889
2890   fetch_source(mach, &r[2], &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
2891
2892   micro_mul(&r[3], &r[3], &r[2]);
2893
2894   fetch_source(mach, &r[5], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
2895
2896   micro_mul(&r[1], &r[1], &r[5]);
2897   micro_sub(&d[TGSI_CHAN_Y], &r[3], &r[1]);
2898
2899   micro_mul(&r[5], &r[5], &r[4]);
2900   micro_mul(&r[0], &r[0], &r[2]);
2901   micro_sub(&d[TGSI_CHAN_Z], &r[5], &r[0]);
2902
2903   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2904      store_dest(mach, &d[TGSI_CHAN_X], &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
2905   }
2906   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2907      store_dest(mach, &d[TGSI_CHAN_Y], &inst->Dst[0], inst, TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2908   }
2909   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2910      store_dest(mach, &d[TGSI_CHAN_Z], &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2911   }
2912   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2913      store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
2914   }
2915}
2916
2917static void
2918exec_dst(struct tgsi_exec_machine *mach,
2919         const struct tgsi_full_instruction *inst)
2920{
2921   union tgsi_exec_channel r[2];
2922   union tgsi_exec_channel d[4];
2923
2924   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2925      fetch_source(mach, &r[0], &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2926      fetch_source(mach, &r[1], &inst->Src[1], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2927      micro_mul(&d[TGSI_CHAN_Y], &r[0], &r[1]);
2928   }
2929   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2930      fetch_source(mach, &d[TGSI_CHAN_Z], &inst->Src[0], TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2931   }
2932   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2933      fetch_source(mach, &d[TGSI_CHAN_W], &inst->Src[1], TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
2934   }
2935
2936   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2937      store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
2938   }
2939   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2940      store_dest(mach, &d[TGSI_CHAN_Y], &inst->Dst[0], inst, TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2941   }
2942   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2943      store_dest(mach, &d[TGSI_CHAN_Z], &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2944   }
2945   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2946      store_dest(mach, &d[TGSI_CHAN_W], &inst->Dst[0], inst, TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
2947   }
2948}
2949
2950static void
2951exec_log(struct tgsi_exec_machine *mach,
2952         const struct tgsi_full_instruction *inst)
2953{
2954   union tgsi_exec_channel r[3];
2955
2956   fetch_source(mach, &r[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
2957   micro_abs(&r[2], &r[0]);  /* r2 = abs(r0) */
2958   micro_lg2(&r[1], &r[2]);  /* r1 = lg2(r2) */
2959   micro_flr(&r[0], &r[1]);  /* r0 = floor(r1) */
2960   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2961      store_dest(mach, &r[0], &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
2962   }
2963   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2964      micro_exp2(&r[0], &r[0]);       /* r0 = 2 ^ r0 */
2965      micro_div(&r[0], &r[2], &r[0]); /* r0 = r2 / r0 */
2966      store_dest(mach, &r[0], &inst->Dst[0], inst, TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2967   }
2968   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2969      store_dest(mach, &r[1], &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2970   }
2971   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2972      store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
2973   }
2974}
2975
2976static void
2977exec_exp(struct tgsi_exec_machine *mach,
2978         const struct tgsi_full_instruction *inst)
2979{
2980   union tgsi_exec_channel r[3];
2981
2982   fetch_source(mach, &r[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
2983   micro_flr(&r[1], &r[0]);  /* r1 = floor(r0) */
2984   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2985      micro_exp2(&r[2], &r[1]);       /* r2 = 2 ^ r1 */
2986      store_dest(mach, &r[2], &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
2987   }
2988   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2989      micro_sub(&r[2], &r[0], &r[1]); /* r2 = r0 - r1 */
2990      store_dest(mach, &r[2], &inst->Dst[0], inst, TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2991   }
2992   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2993      micro_exp2(&r[2], &r[0]);       /* r2 = 2 ^ r0 */
2994      store_dest(mach, &r[2], &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2995   }
2996   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2997      store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
2998   }
2999}
3000
3001static void
3002exec_lit(struct tgsi_exec_machine *mach,
3003         const struct tgsi_full_instruction *inst)
3004{
3005   union tgsi_exec_channel r[3];
3006   union tgsi_exec_channel d[3];
3007
3008   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_YZ) {
3009      fetch_source(mach, &r[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3010      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
3011         fetch_source(mach, &r[1], &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3012         micro_max(&r[1], &r[1], &ZeroVec);
3013
3014         fetch_source(mach, &r[2], &inst->Src[0], TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
3015         micro_min(&r[2], &r[2], &P128Vec);
3016         micro_max(&r[2], &r[2], &M128Vec);
3017         micro_pow(&r[1], &r[1], &r[2]);
3018         micro_lt(&d[TGSI_CHAN_Z], &ZeroVec, &r[0], &r[1], &ZeroVec);
3019         store_dest(mach, &d[TGSI_CHAN_Z], &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
3020      }
3021      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
3022         micro_max(&d[TGSI_CHAN_Y], &r[0], &ZeroVec);
3023         store_dest(mach, &d[TGSI_CHAN_Y], &inst->Dst[0], inst, TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3024      }
3025   }
3026   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
3027      store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3028   }
3029
3030   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
3031      store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
3032   }
3033}
3034
3035static void
3036exec_break(struct tgsi_exec_machine *mach)
3037{
3038   if (mach->BreakType == TGSI_EXEC_BREAK_INSIDE_LOOP) {
3039      /* turn off loop channels for each enabled exec channel */
3040      mach->LoopMask &= ~mach->ExecMask;
3041      /* Todo: if mach->LoopMask == 0, jump to end of loop */
3042      UPDATE_EXEC_MASK(mach);
3043   } else {
3044      assert(mach->BreakType == TGSI_EXEC_BREAK_INSIDE_SWITCH);
3045
3046      mach->Switch.mask = 0x0;
3047
3048      UPDATE_EXEC_MASK(mach);
3049   }
3050}
3051
3052static void
3053exec_switch(struct tgsi_exec_machine *mach,
3054            const struct tgsi_full_instruction *inst)
3055{
3056   assert(mach->SwitchStackTop < TGSI_EXEC_MAX_SWITCH_NESTING);
3057   assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
3058
3059   mach->SwitchStack[mach->SwitchStackTop++] = mach->Switch;
3060   fetch_source(mach, &mach->Switch.selector, &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_UINT);
3061   mach->Switch.mask = 0x0;
3062   mach->Switch.defaultMask = 0x0;
3063
3064   mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
3065   mach->BreakType = TGSI_EXEC_BREAK_INSIDE_SWITCH;
3066
3067   UPDATE_EXEC_MASK(mach);
3068}
3069
3070static void
3071exec_case(struct tgsi_exec_machine *mach,
3072          const struct tgsi_full_instruction *inst)
3073{
3074   uint prevMask = mach->SwitchStack[mach->SwitchStackTop - 1].mask;
3075   union tgsi_exec_channel src;
3076   uint mask = 0;
3077
3078   fetch_source(mach, &src, &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_UINT);
3079
3080   if (mach->Switch.selector.u[0] == src.u[0]) {
3081      mask |= 0x1;
3082   }
3083   if (mach->Switch.selector.u[1] == src.u[1]) {
3084      mask |= 0x2;
3085   }
3086   if (mach->Switch.selector.u[2] == src.u[2]) {
3087      mask |= 0x4;
3088   }
3089   if (mach->Switch.selector.u[3] == src.u[3]) {
3090      mask |= 0x8;
3091   }
3092
3093   mach->Switch.defaultMask |= mask;
3094
3095   mach->Switch.mask |= mask & prevMask;
3096
3097   UPDATE_EXEC_MASK(mach);
3098}
3099
3100static void
3101exec_default(struct tgsi_exec_machine *mach)
3102{
3103   uint prevMask = mach->SwitchStack[mach->SwitchStackTop - 1].mask;
3104
3105   mach->Switch.mask |= ~mach->Switch.defaultMask & prevMask;
3106
3107   UPDATE_EXEC_MASK(mach);
3108}
3109
3110static void
3111exec_endswitch(struct tgsi_exec_machine *mach)
3112{
3113   mach->Switch = mach->SwitchStack[--mach->SwitchStackTop];
3114   mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
3115
3116   UPDATE_EXEC_MASK(mach);
3117}
3118
3119static void
3120micro_i2f(union tgsi_exec_channel *dst,
3121          const union tgsi_exec_channel *src)
3122{
3123   dst->f[0] = (float)src->i[0];
3124   dst->f[1] = (float)src->i[1];
3125   dst->f[2] = (float)src->i[2];
3126   dst->f[3] = (float)src->i[3];
3127}
3128
3129static void
3130micro_not(union tgsi_exec_channel *dst,
3131          const union tgsi_exec_channel *src)
3132{
3133   dst->u[0] = ~src->u[0];
3134   dst->u[1] = ~src->u[1];
3135   dst->u[2] = ~src->u[2];
3136   dst->u[3] = ~src->u[3];
3137}
3138
3139static void
3140micro_shl(union tgsi_exec_channel *dst,
3141          const union tgsi_exec_channel *src0,
3142          const union tgsi_exec_channel *src1)
3143{
3144   dst->u[0] = src0->u[0] << src1->u[0];
3145   dst->u[1] = src0->u[1] << src1->u[1];
3146   dst->u[2] = src0->u[2] << src1->u[2];
3147   dst->u[3] = src0->u[3] << src1->u[3];
3148}
3149
3150static void
3151micro_and(union tgsi_exec_channel *dst,
3152          const union tgsi_exec_channel *src0,
3153          const union tgsi_exec_channel *src1)
3154{
3155   dst->u[0] = src0->u[0] & src1->u[0];
3156   dst->u[1] = src0->u[1] & src1->u[1];
3157   dst->u[2] = src0->u[2] & src1->u[2];
3158   dst->u[3] = src0->u[3] & src1->u[3];
3159}
3160
3161static void
3162micro_or(union tgsi_exec_channel *dst,
3163         const union tgsi_exec_channel *src0,
3164         const union tgsi_exec_channel *src1)
3165{
3166   dst->u[0] = src0->u[0] | src1->u[0];
3167   dst->u[1] = src0->u[1] | src1->u[1];
3168   dst->u[2] = src0->u[2] | src1->u[2];
3169   dst->u[3] = src0->u[3] | src1->u[3];
3170}
3171
3172static void
3173micro_xor(union tgsi_exec_channel *dst,
3174          const union tgsi_exec_channel *src0,
3175          const union tgsi_exec_channel *src1)
3176{
3177   dst->u[0] = src0->u[0] ^ src1->u[0];
3178   dst->u[1] = src0->u[1] ^ src1->u[1];
3179   dst->u[2] = src0->u[2] ^ src1->u[2];
3180   dst->u[3] = src0->u[3] ^ src1->u[3];
3181}
3182
3183static void
3184micro_mod(union tgsi_exec_channel *dst,
3185          const union tgsi_exec_channel *src0,
3186          const union tgsi_exec_channel *src1)
3187{
3188   dst->i[0] = src0->i[0] % src1->i[0];
3189   dst->i[1] = src0->i[1] % src1->i[1];
3190   dst->i[2] = src0->i[2] % src1->i[2];
3191   dst->i[3] = src0->i[3] % src1->i[3];
3192}
3193
3194static void
3195micro_f2i(union tgsi_exec_channel *dst,
3196          const union tgsi_exec_channel *src)
3197{
3198   dst->i[0] = (int)src->f[0];
3199   dst->i[1] = (int)src->f[1];
3200   dst->i[2] = (int)src->f[2];
3201   dst->i[3] = (int)src->f[3];
3202}
3203
3204static void
3205micro_idiv(union tgsi_exec_channel *dst,
3206           const union tgsi_exec_channel *src0,
3207           const union tgsi_exec_channel *src1)
3208{
3209   dst->i[0] = src0->i[0] / src1->i[0];
3210   dst->i[1] = src0->i[1] / src1->i[1];
3211   dst->i[2] = src0->i[2] / src1->i[2];
3212   dst->i[3] = src0->i[3] / src1->i[3];
3213}
3214
3215static void
3216micro_imax(union tgsi_exec_channel *dst,
3217           const union tgsi_exec_channel *src0,
3218           const union tgsi_exec_channel *src1)
3219{
3220   dst->i[0] = src0->i[0] > src1->i[0] ? src0->i[0] : src1->i[0];
3221   dst->i[1] = src0->i[1] > src1->i[1] ? src0->i[1] : src1->i[1];
3222   dst->i[2] = src0->i[2] > src1->i[2] ? src0->i[2] : src1->i[2];
3223   dst->i[3] = src0->i[3] > src1->i[3] ? src0->i[3] : src1->i[3];
3224}
3225
3226static void
3227micro_imin(union tgsi_exec_channel *dst,
3228           const union tgsi_exec_channel *src0,
3229           const union tgsi_exec_channel *src1)
3230{
3231   dst->i[0] = src0->i[0] < src1->i[0] ? src0->i[0] : src1->i[0];
3232   dst->i[1] = src0->i[1] < src1->i[1] ? src0->i[1] : src1->i[1];
3233   dst->i[2] = src0->i[2] < src1->i[2] ? src0->i[2] : src1->i[2];
3234   dst->i[3] = src0->i[3] < src1->i[3] ? src0->i[3] : src1->i[3];
3235}
3236
3237static void
3238micro_isge(union tgsi_exec_channel *dst,
3239           const union tgsi_exec_channel *src0,
3240           const union tgsi_exec_channel *src1)
3241{
3242   dst->i[0] = src0->i[0] >= src1->i[0] ? -1 : 0;
3243   dst->i[1] = src0->i[1] >= src1->i[1] ? -1 : 0;
3244   dst->i[2] = src0->i[2] >= src1->i[2] ? -1 : 0;
3245   dst->i[3] = src0->i[3] >= src1->i[3] ? -1 : 0;
3246}
3247
3248static void
3249micro_ishr(union tgsi_exec_channel *dst,
3250           const union tgsi_exec_channel *src0,
3251           const union tgsi_exec_channel *src1)
3252{
3253   dst->i[0] = src0->i[0] >> src1->i[0];
3254   dst->i[1] = src0->i[1] >> src1->i[1];
3255   dst->i[2] = src0->i[2] >> src1->i[2];
3256   dst->i[3] = src0->i[3] >> src1->i[3];
3257}
3258
3259static void
3260micro_islt(union tgsi_exec_channel *dst,
3261           const union tgsi_exec_channel *src0,
3262           const union tgsi_exec_channel *src1)
3263{
3264   dst->i[0] = src0->i[0] < src1->i[0] ? -1 : 0;
3265   dst->i[1] = src0->i[1] < src1->i[1] ? -1 : 0;
3266   dst->i[2] = src0->i[2] < src1->i[2] ? -1 : 0;
3267   dst->i[3] = src0->i[3] < src1->i[3] ? -1 : 0;
3268}
3269
3270static void
3271micro_f2u(union tgsi_exec_channel *dst,
3272          const union tgsi_exec_channel *src)
3273{
3274   dst->u[0] = (uint)src->f[0];
3275   dst->u[1] = (uint)src->f[1];
3276   dst->u[2] = (uint)src->f[2];
3277   dst->u[3] = (uint)src->f[3];
3278}
3279
3280static void
3281micro_u2f(union tgsi_exec_channel *dst,
3282          const union tgsi_exec_channel *src)
3283{
3284   dst->f[0] = (float)src->u[0];
3285   dst->f[1] = (float)src->u[1];
3286   dst->f[2] = (float)src->u[2];
3287   dst->f[3] = (float)src->u[3];
3288}
3289
3290static void
3291micro_uadd(union tgsi_exec_channel *dst,
3292           const union tgsi_exec_channel *src0,
3293           const union tgsi_exec_channel *src1)
3294{
3295   dst->u[0] = src0->u[0] + src1->u[0];
3296   dst->u[1] = src0->u[1] + src1->u[1];
3297   dst->u[2] = src0->u[2] + src1->u[2];
3298   dst->u[3] = src0->u[3] + src1->u[3];
3299}
3300
3301static void
3302micro_udiv(union tgsi_exec_channel *dst,
3303           const union tgsi_exec_channel *src0,
3304           const union tgsi_exec_channel *src1)
3305{
3306   dst->u[0] = src0->u[0] / src1->u[0];
3307   dst->u[1] = src0->u[1] / src1->u[1];
3308   dst->u[2] = src0->u[2] / src1->u[2];
3309   dst->u[3] = src0->u[3] / src1->u[3];
3310}
3311
3312static void
3313micro_umad(union tgsi_exec_channel *dst,
3314           const union tgsi_exec_channel *src0,
3315           const union tgsi_exec_channel *src1,
3316           const union tgsi_exec_channel *src2)
3317{
3318   dst->u[0] = src0->u[0] * src1->u[0] + src2->u[0];
3319   dst->u[1] = src0->u[1] * src1->u[1] + src2->u[1];
3320   dst->u[2] = src0->u[2] * src1->u[2] + src2->u[2];
3321   dst->u[3] = src0->u[3] * src1->u[3] + src2->u[3];
3322}
3323
3324static void
3325micro_umax(union tgsi_exec_channel *dst,
3326           const union tgsi_exec_channel *src0,
3327           const union tgsi_exec_channel *src1)
3328{
3329   dst->u[0] = src0->u[0] > src1->u[0] ? src0->u[0] : src1->u[0];
3330   dst->u[1] = src0->u[1] > src1->u[1] ? src0->u[1] : src1->u[1];
3331   dst->u[2] = src0->u[2] > src1->u[2] ? src0->u[2] : src1->u[2];
3332   dst->u[3] = src0->u[3] > src1->u[3] ? src0->u[3] : src1->u[3];
3333}
3334
3335static void
3336micro_umin(union tgsi_exec_channel *dst,
3337           const union tgsi_exec_channel *src0,
3338           const union tgsi_exec_channel *src1)
3339{
3340   dst->u[0] = src0->u[0] < src1->u[0] ? src0->u[0] : src1->u[0];
3341   dst->u[1] = src0->u[1] < src1->u[1] ? src0->u[1] : src1->u[1];
3342   dst->u[2] = src0->u[2] < src1->u[2] ? src0->u[2] : src1->u[2];
3343   dst->u[3] = src0->u[3] < src1->u[3] ? src0->u[3] : src1->u[3];
3344}
3345
3346static void
3347micro_umod(union tgsi_exec_channel *dst,
3348           const union tgsi_exec_channel *src0,
3349           const union tgsi_exec_channel *src1)
3350{
3351   dst->u[0] = src0->u[0] % src1->u[0];
3352   dst->u[1] = src0->u[1] % src1->u[1];
3353   dst->u[2] = src0->u[2] % src1->u[2];
3354   dst->u[3] = src0->u[3] % src1->u[3];
3355}
3356
3357static void
3358micro_umul(union tgsi_exec_channel *dst,
3359           const union tgsi_exec_channel *src0,
3360           const union tgsi_exec_channel *src1)
3361{
3362   dst->u[0] = src0->u[0] * src1->u[0];
3363   dst->u[1] = src0->u[1] * src1->u[1];
3364   dst->u[2] = src0->u[2] * src1->u[2];
3365   dst->u[3] = src0->u[3] * src1->u[3];
3366}
3367
3368static void
3369micro_useq(union tgsi_exec_channel *dst,
3370           const union tgsi_exec_channel *src0,
3371           const union tgsi_exec_channel *src1)
3372{
3373   dst->u[0] = src0->u[0] == src1->u[0] ? ~0 : 0;
3374   dst->u[1] = src0->u[1] == src1->u[1] ? ~0 : 0;
3375   dst->u[2] = src0->u[2] == src1->u[2] ? ~0 : 0;
3376   dst->u[3] = src0->u[3] == src1->u[3] ? ~0 : 0;
3377}
3378
3379static void
3380micro_usge(union tgsi_exec_channel *dst,
3381           const union tgsi_exec_channel *src0,
3382           const union tgsi_exec_channel *src1)
3383{
3384   dst->u[0] = src0->u[0] >= src1->u[0] ? ~0 : 0;
3385   dst->u[1] = src0->u[1] >= src1->u[1] ? ~0 : 0;
3386   dst->u[2] = src0->u[2] >= src1->u[2] ? ~0 : 0;
3387   dst->u[3] = src0->u[3] >= src1->u[3] ? ~0 : 0;
3388}
3389
3390static void
3391micro_ushr(union tgsi_exec_channel *dst,
3392           const union tgsi_exec_channel *src0,
3393           const union tgsi_exec_channel *src1)
3394{
3395   dst->u[0] = src0->u[0] >> src1->u[0];
3396   dst->u[1] = src0->u[1] >> src1->u[1];
3397   dst->u[2] = src0->u[2] >> src1->u[2];
3398   dst->u[3] = src0->u[3] >> src1->u[3];
3399}
3400
3401static void
3402micro_uslt(union tgsi_exec_channel *dst,
3403           const union tgsi_exec_channel *src0,
3404           const union tgsi_exec_channel *src1)
3405{
3406   dst->u[0] = src0->u[0] < src1->u[0] ? ~0 : 0;
3407   dst->u[1] = src0->u[1] < src1->u[1] ? ~0 : 0;
3408   dst->u[2] = src0->u[2] < src1->u[2] ? ~0 : 0;
3409   dst->u[3] = src0->u[3] < src1->u[3] ? ~0 : 0;
3410}
3411
3412static void
3413micro_usne(union tgsi_exec_channel *dst,
3414           const union tgsi_exec_channel *src0,
3415           const union tgsi_exec_channel *src1)
3416{
3417   dst->u[0] = src0->u[0] != src1->u[0] ? ~0 : 0;
3418   dst->u[1] = src0->u[1] != src1->u[1] ? ~0 : 0;
3419   dst->u[2] = src0->u[2] != src1->u[2] ? ~0 : 0;
3420   dst->u[3] = src0->u[3] != src1->u[3] ? ~0 : 0;
3421}
3422
3423static void
3424micro_uarl(union tgsi_exec_channel *dst,
3425           const union tgsi_exec_channel *src)
3426{
3427   dst->i[0] = src->u[0];
3428   dst->i[1] = src->u[1];
3429   dst->i[2] = src->u[2];
3430   dst->i[3] = src->u[3];
3431}
3432
3433static void
3434micro_ucmp(union tgsi_exec_channel *dst,
3435           const union tgsi_exec_channel *src0,
3436           const union tgsi_exec_channel *src1,
3437           const union tgsi_exec_channel *src2)
3438{
3439   dst->u[0] = src0->u[0] ? src1->u[0] : src2->u[0];
3440   dst->u[1] = src0->u[1] ? src1->u[1] : src2->u[1];
3441   dst->u[2] = src0->u[2] ? src1->u[2] : src2->u[2];
3442   dst->u[3] = src0->u[3] ? src1->u[3] : src2->u[3];
3443}
3444
3445static void
3446exec_instruction(
3447   struct tgsi_exec_machine *mach,
3448   const struct tgsi_full_instruction *inst,
3449   int *pc )
3450{
3451   union tgsi_exec_channel r[10];
3452
3453   (*pc)++;
3454
3455   switch (inst->Instruction.Opcode) {
3456   case TGSI_OPCODE_ARL:
3457      exec_vector_unary(mach, inst, micro_arl, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
3458      break;
3459
3460   case TGSI_OPCODE_MOV:
3461      exec_vector_unary(mach, inst, micro_mov, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
3462      break;
3463
3464   case TGSI_OPCODE_LIT:
3465      exec_lit(mach, inst);
3466      break;
3467
3468   case TGSI_OPCODE_RCP:
3469      exec_scalar_unary(mach, inst, micro_rcp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3470      break;
3471
3472   case TGSI_OPCODE_RSQ:
3473      exec_scalar_unary(mach, inst, micro_rsq, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3474      break;
3475
3476   case TGSI_OPCODE_EXP:
3477      exec_exp(mach, inst);
3478      break;
3479
3480   case TGSI_OPCODE_LOG:
3481      exec_log(mach, inst);
3482      break;
3483
3484   case TGSI_OPCODE_MUL:
3485      exec_vector_binary(mach, inst, micro_mul, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3486      break;
3487
3488   case TGSI_OPCODE_ADD:
3489      exec_vector_binary(mach, inst, micro_add, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3490      break;
3491
3492   case TGSI_OPCODE_DP3:
3493      exec_dp3(mach, inst);
3494      break;
3495
3496   case TGSI_OPCODE_DP4:
3497      exec_dp4(mach, inst);
3498      break;
3499
3500   case TGSI_OPCODE_DST:
3501      exec_dst(mach, inst);
3502      break;
3503
3504   case TGSI_OPCODE_MIN:
3505      exec_vector_binary(mach, inst, micro_min, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3506      break;
3507
3508   case TGSI_OPCODE_MAX:
3509      exec_vector_binary(mach, inst, micro_max, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3510      break;
3511
3512   case TGSI_OPCODE_SLT:
3513      exec_vector_binary(mach, inst, micro_slt, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3514      break;
3515
3516   case TGSI_OPCODE_SGE:
3517      exec_vector_binary(mach, inst, micro_sge, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3518      break;
3519
3520   case TGSI_OPCODE_MAD:
3521      exec_vector_trinary(mach, inst, micro_mad, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3522      break;
3523
3524   case TGSI_OPCODE_SUB:
3525      exec_vector_binary(mach, inst, micro_sub, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3526      break;
3527
3528   case TGSI_OPCODE_LRP:
3529      exec_vector_trinary(mach, inst, micro_lrp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3530      break;
3531
3532   case TGSI_OPCODE_CND:
3533      exec_vector_trinary(mach, inst, micro_cnd, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3534      break;
3535
3536   case TGSI_OPCODE_DP2A:
3537      exec_dp2a(mach, inst);
3538      break;
3539
3540   case TGSI_OPCODE_FRC:
3541      exec_vector_unary(mach, inst, micro_frc, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3542      break;
3543
3544   case TGSI_OPCODE_CLAMP:
3545      exec_vector_trinary(mach, inst, micro_clamp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3546      break;
3547
3548   case TGSI_OPCODE_FLR:
3549      exec_vector_unary(mach, inst, micro_flr, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3550      break;
3551
3552   case TGSI_OPCODE_ROUND:
3553      exec_vector_unary(mach, inst, micro_rnd, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3554      break;
3555
3556   case TGSI_OPCODE_EX2:
3557      exec_scalar_unary(mach, inst, micro_exp2, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3558      break;
3559
3560   case TGSI_OPCODE_LG2:
3561      exec_scalar_unary(mach, inst, micro_lg2, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3562      break;
3563
3564   case TGSI_OPCODE_POW:
3565      exec_scalar_binary(mach, inst, micro_pow, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3566      break;
3567
3568   case TGSI_OPCODE_XPD:
3569      exec_xpd(mach, inst);
3570      break;
3571
3572   case TGSI_OPCODE_ABS:
3573      exec_vector_unary(mach, inst, micro_abs, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3574      break;
3575
3576   case TGSI_OPCODE_RCC:
3577      exec_scalar_unary(mach, inst, micro_rcc, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3578      break;
3579
3580   case TGSI_OPCODE_DPH:
3581      exec_dph(mach, inst);
3582      break;
3583
3584   case TGSI_OPCODE_COS:
3585      exec_scalar_unary(mach, inst, micro_cos, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3586      break;
3587
3588   case TGSI_OPCODE_DDX:
3589      exec_vector_unary(mach, inst, micro_ddx, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3590      break;
3591
3592   case TGSI_OPCODE_DDY:
3593      exec_vector_unary(mach, inst, micro_ddy, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3594      break;
3595
3596   case TGSI_OPCODE_KILP:
3597      exec_kilp (mach, inst);
3598      break;
3599
3600   case TGSI_OPCODE_KIL:
3601      exec_kil (mach, inst);
3602      break;
3603
3604   case TGSI_OPCODE_PK2H:
3605      assert (0);
3606      break;
3607
3608   case TGSI_OPCODE_PK2US:
3609      assert (0);
3610      break;
3611
3612   case TGSI_OPCODE_PK4B:
3613      assert (0);
3614      break;
3615
3616   case TGSI_OPCODE_PK4UB:
3617      assert (0);
3618      break;
3619
3620   case TGSI_OPCODE_RFL:
3621      exec_rfl(mach, inst);
3622      break;
3623
3624   case TGSI_OPCODE_SEQ:
3625      exec_vector_binary(mach, inst, micro_seq, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3626      break;
3627
3628   case TGSI_OPCODE_SFL:
3629      exec_vector(mach, inst, micro_sfl, TGSI_EXEC_DATA_FLOAT);
3630      break;
3631
3632   case TGSI_OPCODE_SGT:
3633      exec_vector_binary(mach, inst, micro_sgt, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3634      break;
3635
3636   case TGSI_OPCODE_SIN:
3637      exec_scalar_unary(mach, inst, micro_sin, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3638      break;
3639
3640   case TGSI_OPCODE_SLE:
3641      exec_vector_binary(mach, inst, micro_sle, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3642      break;
3643
3644   case TGSI_OPCODE_SNE:
3645      exec_vector_binary(mach, inst, micro_sne, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3646      break;
3647
3648   case TGSI_OPCODE_STR:
3649      exec_vector(mach, inst, micro_str, TGSI_EXEC_DATA_FLOAT);
3650      break;
3651
3652   case TGSI_OPCODE_TEX:
3653      /* simple texture lookup */
3654      /* src[0] = texcoord */
3655      /* src[1] = sampler unit */
3656      exec_tex(mach, inst, TEX_MODIFIER_NONE);
3657      break;
3658
3659   case TGSI_OPCODE_TXB:
3660      /* Texture lookup with lod bias */
3661      /* src[0] = texcoord (src[0].w = LOD bias) */
3662      /* src[1] = sampler unit */
3663      exec_tex(mach, inst, TEX_MODIFIER_LOD_BIAS);
3664      break;
3665
3666   case TGSI_OPCODE_TXD:
      /* Texture lookup with explicit partial derivatives */
3668      /* src[0] = texcoord */
3669      /* src[1] = d[strq]/dx */
3670      /* src[2] = d[strq]/dy */
3671      /* src[3] = sampler unit */
3672      exec_txd(mach, inst);
3673      break;
3674
3675   case TGSI_OPCODE_TXL:
      /* Texture lookup with explicit LOD */
3677      /* src[0] = texcoord (src[0].w = LOD) */
3678      /* src[1] = sampler unit */
3679      exec_tex(mach, inst, TEX_MODIFIER_EXPLICIT_LOD);
3680      break;
3681
3682   case TGSI_OPCODE_TXP:
3683      /* Texture lookup with projection */
3684      /* src[0] = texcoord (src[0].w = projection) */
3685      /* src[1] = sampler unit */
3686      exec_tex(mach, inst, TEX_MODIFIER_PROJECTED);
3687      break;
3688
3689   case TGSI_OPCODE_UP2H:
3690      assert (0);
3691      break;
3692
3693   case TGSI_OPCODE_UP2US:
3694      assert (0);
3695      break;
3696
3697   case TGSI_OPCODE_UP4B:
3698      assert (0);
3699      break;
3700
3701   case TGSI_OPCODE_UP4UB:
3702      assert (0);
3703      break;
3704
3705   case TGSI_OPCODE_X2D:
3706      exec_x2d(mach, inst);
3707      break;
3708
3709   case TGSI_OPCODE_ARA:
3710      assert (0);
3711      break;
3712
3713   case TGSI_OPCODE_ARR:
3714      exec_vector_unary(mach, inst, micro_arr, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
3715      break;
3716
3717   case TGSI_OPCODE_BRA:
3718      assert (0);
3719      break;
3720
3721   case TGSI_OPCODE_CAL:
3722      /* skip the call if no execution channels are enabled */
3723      if (mach->ExecMask) {
3724         /* do the call */
3725
3726         /* First, record the depths of the execution stacks.
3727          * This is important for deeply nested/looped return statements.
3728          * We have to unwind the stacks by the correct amount.  For a
3729          * real code generator, we could determine the number of entries
3730          * to pop off each stack with simple static analysis and avoid
3731          * implementing this data structure at run time.
3732          */
3733         mach->CallStack[mach->CallStackTop].CondStackTop = mach->CondStackTop;
3734         mach->CallStack[mach->CallStackTop].LoopStackTop = mach->LoopStackTop;
3735         mach->CallStack[mach->CallStackTop].ContStackTop = mach->ContStackTop;
3736         mach->CallStack[mach->CallStackTop].SwitchStackTop = mach->SwitchStackTop;
3737         mach->CallStack[mach->CallStackTop].BreakStackTop = mach->BreakStackTop;
3738         /* note that PC was already incremented above */
3739         mach->CallStack[mach->CallStackTop].ReturnAddr = *pc;
3740
3741         mach->CallStackTop++;
3742
3743         /* Second, push the Cond, Loop, Cont, Switch, Break, Func stacks */
3744         assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
3745         assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3746         assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3747         assert(mach->SwitchStackTop < TGSI_EXEC_MAX_SWITCH_NESTING);
3748         assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
3749         assert(mach->FuncStackTop < TGSI_EXEC_MAX_CALL_NESTING);
3750
3751         mach->CondStack[mach->CondStackTop++] = mach->CondMask;
3752         mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
3753         mach->ContStack[mach->ContStackTop++] = mach->ContMask;
3754         mach->SwitchStack[mach->SwitchStackTop++] = mach->Switch;
3755         mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
3756         mach->FuncStack[mach->FuncStackTop++] = mach->FuncMask;
3757
3758         /* Finally, jump to the subroutine */
3759         *pc = inst->Label.Label;
3760      }
3761      break;
3762
3763   case TGSI_OPCODE_RET:
3764      mach->FuncMask &= ~mach->ExecMask;
3765      UPDATE_EXEC_MASK(mach);
3766
3767      if (mach->FuncMask == 0x0) {
3768         /* really return now (otherwise, keep executing) */
3769
3770         if (mach->CallStackTop == 0) {
3771            /* returning from main() */
3772            mach->CondStackTop = 0;
3773            mach->LoopStackTop = 0;
3774            *pc = -1;
3775            return;
3776         }
3777
3778         assert(mach->CallStackTop > 0);
3779         mach->CallStackTop--;
3780
3781         mach->CondStackTop = mach->CallStack[mach->CallStackTop].CondStackTop;
3782         mach->CondMask = mach->CondStack[mach->CondStackTop];
3783
3784         mach->LoopStackTop = mach->CallStack[mach->CallStackTop].LoopStackTop;
3785         mach->LoopMask = mach->LoopStack[mach->LoopStackTop];
3786
3787         mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
3788         mach->ContMask = mach->ContStack[mach->ContStackTop];
3789
3790         mach->SwitchStackTop = mach->CallStack[mach->CallStackTop].SwitchStackTop;
3791         mach->Switch = mach->SwitchStack[mach->SwitchStackTop];
3792
3793         mach->BreakStackTop = mach->CallStack[mach->CallStackTop].BreakStackTop;
3794         mach->BreakType = mach->BreakStack[mach->BreakStackTop];
3795
3796         assert(mach->FuncStackTop > 0);
3797         mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
3798
3799         *pc = mach->CallStack[mach->CallStackTop].ReturnAddr;
3800
3801         UPDATE_EXEC_MASK(mach);
3802      }
3803      break;
3804
3805   case TGSI_OPCODE_SSG:
3806      exec_vector_unary(mach, inst, micro_sgn, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3807      break;
3808
3809   case TGSI_OPCODE_CMP:
3810      exec_vector_trinary(mach, inst, micro_cmp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3811      break;
3812
3813   case TGSI_OPCODE_SCS:
3814      exec_scs(mach, inst);
3815      break;
3816
3817   case TGSI_OPCODE_NRM:
3818      exec_nrm3(mach, inst);
3819      break;
3820
3821   case TGSI_OPCODE_NRM4:
3822      exec_nrm4(mach, inst);
3823      break;
3824
3825   case TGSI_OPCODE_DIV:
3826      exec_vector_binary(mach, inst, micro_div, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3827      break;
3828
3829   case TGSI_OPCODE_DP2:
3830      exec_dp2(mach, inst);
3831      break;
3832
3833   case TGSI_OPCODE_IF:
3834      /* push CondMask */
3835      assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
3836      mach->CondStack[mach->CondStackTop++] = mach->CondMask;
3837      FETCH( &r[0], 0, TGSI_CHAN_X );
3838      /* update CondMask */
3839      if( ! r[0].u[0] ) {
3840         mach->CondMask &= ~0x1;
3841      }
3842      if( ! r[0].u[1] ) {
3843         mach->CondMask &= ~0x2;
3844      }
3845      if( ! r[0].u[2] ) {
3846         mach->CondMask &= ~0x4;
3847      }
3848      if( ! r[0].u[3] ) {
3849         mach->CondMask &= ~0x8;
3850      }
3851      UPDATE_EXEC_MASK(mach);
3852      /* Todo: If CondMask==0, jump to ELSE */
3853      break;
3854
3855   case TGSI_OPCODE_ELSE:
3856      /* invert CondMask wrt previous mask */
3857      {
3858         uint prevMask;
3859         assert(mach->CondStackTop > 0);
3860         prevMask = mach->CondStack[mach->CondStackTop - 1];
3861         mach->CondMask = ~mach->CondMask & prevMask;
3862         UPDATE_EXEC_MASK(mach);
3863         /* Todo: If CondMask==0, jump to ENDIF */
3864      }
3865      break;
3866
3867   case TGSI_OPCODE_ENDIF:
3868      /* pop CondMask */
3869      assert(mach->CondStackTop > 0);
3870      mach->CondMask = mach->CondStack[--mach->CondStackTop];
3871      UPDATE_EXEC_MASK(mach);
3872      break;
3873
3874   case TGSI_OPCODE_END:
3875      /* make sure we end primitives which haven't
3876       * been explicitly emitted */
3877      conditional_emit_primitive(mach);
3878      /* halt execution */
3879      *pc = -1;
3880      break;
3881
3882   case TGSI_OPCODE_PUSHA:
3883      assert (0);
3884      break;
3885
3886   case TGSI_OPCODE_POPA:
3887      assert (0);
3888      break;
3889
3890   case TGSI_OPCODE_CEIL:
3891      exec_vector_unary(mach, inst, micro_ceil, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3892      break;
3893
3894   case TGSI_OPCODE_I2F:
3895      exec_vector_unary(mach, inst, micro_i2f, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_INT);
3896      break;
3897
3898   case TGSI_OPCODE_NOT:
3899      exec_vector_unary(mach, inst, micro_not, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3900      break;
3901
3902   case TGSI_OPCODE_TRUNC:
3903      exec_vector_unary(mach, inst, micro_trunc, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3904      break;
3905
3906   case TGSI_OPCODE_SHL:
3907      exec_vector_binary(mach, inst, micro_shl, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3908      break;
3909
3910   case TGSI_OPCODE_AND:
3911      exec_vector_binary(mach, inst, micro_and, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3912      break;
3913
3914   case TGSI_OPCODE_OR:
3915      exec_vector_binary(mach, inst, micro_or, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3916      break;
3917
3918   case TGSI_OPCODE_MOD:
3919      exec_vector_binary(mach, inst, micro_mod, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3920      break;
3921
3922   case TGSI_OPCODE_XOR:
3923      exec_vector_binary(mach, inst, micro_xor, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3924      break;
3925
3926   case TGSI_OPCODE_SAD:
3927      assert (0);
3928      break;
3929
3930   case TGSI_OPCODE_TXF:
3931      exec_txf(mach, inst);
3932      break;
3933
3934   case TGSI_OPCODE_TXQ:
3935      exec_txq(mach, inst);
3936      break;
3937
3938   case TGSI_OPCODE_EMIT:
3939      emit_vertex(mach);
3940      break;
3941
3942   case TGSI_OPCODE_ENDPRIM:
3943      emit_primitive(mach);
3944      break;
3945
3946   case TGSI_OPCODE_BGNLOOP:
3947      /* push LoopMask and ContMasks */
3948      assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3949      assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3950      assert(mach->LoopLabelStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3951      assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
3952
3953      mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
3954      mach->ContStack[mach->ContStackTop++] = mach->ContMask;
3955      mach->LoopLabelStack[mach->LoopLabelStackTop++] = *pc - 1;
3956      mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
3957      mach->BreakType = TGSI_EXEC_BREAK_INSIDE_LOOP;
3958      break;
3959
3960   case TGSI_OPCODE_ENDLOOP:
3961      /* Restore ContMask, but don't pop */
3962      assert(mach->ContStackTop > 0);
3963      mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
3964      UPDATE_EXEC_MASK(mach);
3965      if (mach->ExecMask) {
3966         /* repeat loop: jump to instruction just past BGNLOOP */
3967         assert(mach->LoopLabelStackTop > 0);
3968         *pc = mach->LoopLabelStack[mach->LoopLabelStackTop - 1] + 1;
3969      }
3970      else {
3971         /* exit loop: pop LoopMask */
3972         assert(mach->LoopStackTop > 0);
3973         mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
3974         /* pop ContMask */
3975         assert(mach->ContStackTop > 0);
3976         mach->ContMask = mach->ContStack[--mach->ContStackTop];
3977         assert(mach->LoopLabelStackTop > 0);
3978         --mach->LoopLabelStackTop;
3979
3980         mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
3981      }
3982      UPDATE_EXEC_MASK(mach);
3983      break;
3984
3985   case TGSI_OPCODE_BRK:
3986      exec_break(mach);
3987      break;
3988
3989   case TGSI_OPCODE_CONT:
3990      /* turn off cont channels for each enabled exec channel */
3991      mach->ContMask &= ~mach->ExecMask;
3992      /* Todo: if mach->LoopMask == 0, jump to end of loop */
3993      UPDATE_EXEC_MASK(mach);
3994      break;
3995
3996   case TGSI_OPCODE_BGNSUB:
3997      /* no-op */
3998      break;
3999
4000   case TGSI_OPCODE_ENDSUB:
4001      /*
4002       * XXX: This really should be a no-op. We should never reach this opcode.
4003       */
4004
4005      assert(mach->CallStackTop > 0);
4006      mach->CallStackTop--;
4007
4008      mach->CondStackTop = mach->CallStack[mach->CallStackTop].CondStackTop;
4009      mach->CondMask = mach->CondStack[mach->CondStackTop];
4010
4011      mach->LoopStackTop = mach->CallStack[mach->CallStackTop].LoopStackTop;
4012      mach->LoopMask = mach->LoopStack[mach->LoopStackTop];
4013
4014      mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
4015      mach->ContMask = mach->ContStack[mach->ContStackTop];
4016
4017      mach->SwitchStackTop = mach->CallStack[mach->CallStackTop].SwitchStackTop;
4018      mach->Switch = mach->SwitchStack[mach->SwitchStackTop];
4019
4020      mach->BreakStackTop = mach->CallStack[mach->CallStackTop].BreakStackTop;
4021      mach->BreakType = mach->BreakStack[mach->BreakStackTop];
4022
4023      assert(mach->FuncStackTop > 0);
4024      mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
4025
4026      *pc = mach->CallStack[mach->CallStackTop].ReturnAddr;
4027
4028      UPDATE_EXEC_MASK(mach);
4029      break;
4030
4031   case TGSI_OPCODE_NOP:
4032      break;
4033
4034   case TGSI_OPCODE_BREAKC:
4035      FETCH(&r[0], 0, TGSI_CHAN_X);
4036      /* update LoopMask */
4037      if (r[0].u[0] && (mach->ExecMask & 0x1)) {
4038         mach->LoopMask &= ~0x1;
4039      }
4040      if (r[0].u[1] && (mach->ExecMask & 0x2)) {
4041         mach->LoopMask &= ~0x2;
4042      }
4043      if (r[0].u[2] && (mach->ExecMask & 0x4)) {
4044         mach->LoopMask &= ~0x4;
4045      }
4046      if (r[0].u[3] && (mach->ExecMask & 0x8)) {
4047         mach->LoopMask &= ~0x8;
4048      }
4049      /* Todo: if mach->LoopMask == 0, jump to end of loop */
4050      UPDATE_EXEC_MASK(mach);
4051      break;
4052
4053   case TGSI_OPCODE_F2I:
4054      exec_vector_unary(mach, inst, micro_f2i, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
4055      break;
4056
4057   case TGSI_OPCODE_IDIV:
4058      exec_vector_binary(mach, inst, micro_idiv, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
4059      break;
4060
4061   case TGSI_OPCODE_IMAX:
4062      exec_vector_binary(mach, inst, micro_imax, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
4063      break;
4064
4065   case TGSI_OPCODE_IMIN:
4066      exec_vector_binary(mach, inst, micro_imin, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
4067      break;
4068
4069   case TGSI_OPCODE_INEG:
4070      exec_vector_unary(mach, inst, micro_ineg, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
4071      break;
4072
4073   case TGSI_OPCODE_ISGE:
4074      exec_vector_binary(mach, inst, micro_isge, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
4075      break;
4076
4077   case TGSI_OPCODE_ISHR:
4078      exec_vector_binary(mach, inst, micro_ishr, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
4079      break;
4080
4081   case TGSI_OPCODE_ISLT:
4082      exec_vector_binary(mach, inst, micro_islt, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
4083      break;
4084
4085   case TGSI_OPCODE_F2U:
4086      exec_vector_unary(mach, inst, micro_f2u, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
4087      break;
4088
4089   case TGSI_OPCODE_U2F:
4090      exec_vector_unary(mach, inst, micro_u2f, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_UINT);
4091      break;
4092
4093   case TGSI_OPCODE_UADD:
4094      exec_vector_binary(mach, inst, micro_uadd, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4095      break;
4096
4097   case TGSI_OPCODE_UDIV:
4098      exec_vector_binary(mach, inst, micro_udiv, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4099      break;
4100
4101   case TGSI_OPCODE_UMAD:
4102      exec_vector_trinary(mach, inst, micro_umad, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4103      break;
4104
4105   case TGSI_OPCODE_UMAX:
4106      exec_vector_binary(mach, inst, micro_umax, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4107      break;
4108
4109   case TGSI_OPCODE_UMIN:
4110      exec_vector_binary(mach, inst, micro_umin, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4111      break;
4112
4113   case TGSI_OPCODE_UMOD:
4114      exec_vector_binary(mach, inst, micro_umod, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4115      break;
4116
4117   case TGSI_OPCODE_UMUL:
4118      exec_vector_binary(mach, inst, micro_umul, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4119      break;
4120
4121   case TGSI_OPCODE_USEQ:
4122      exec_vector_binary(mach, inst, micro_useq, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4123      break;
4124
4125   case TGSI_OPCODE_USGE:
4126      exec_vector_binary(mach, inst, micro_usge, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4127      break;
4128
4129   case TGSI_OPCODE_USHR:
4130      exec_vector_binary(mach, inst, micro_ushr, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4131      break;
4132
4133   case TGSI_OPCODE_USLT:
4134      exec_vector_binary(mach, inst, micro_uslt, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4135      break;
4136
4137   case TGSI_OPCODE_USNE:
4138      exec_vector_binary(mach, inst, micro_usne, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4139      break;
4140
4141   case TGSI_OPCODE_SWITCH:
4142      exec_switch(mach, inst);
4143      break;
4144
4145   case TGSI_OPCODE_CASE:
4146      exec_case(mach, inst);
4147      break;
4148
4149   case TGSI_OPCODE_DEFAULT:
4150      exec_default(mach);
4151      break;
4152
4153   case TGSI_OPCODE_ENDSWITCH:
4154      exec_endswitch(mach);
4155      break;
4156
4157   case TGSI_OPCODE_SAMPLE_I:
4158      assert(0);
4159      break;
4160
4161   case TGSI_OPCODE_SAMPLE_I_MS:
4162      assert(0);
4163      break;
4164
4165   case TGSI_OPCODE_SAMPLE:
4166      exec_sample(mach, inst, TEX_MODIFIER_NONE);
4167      break;
4168
4169   case TGSI_OPCODE_SAMPLE_B:
4170      exec_sample(mach, inst, TEX_MODIFIER_LOD_BIAS);
4171      break;
4172
4173   case TGSI_OPCODE_SAMPLE_C:
4174      exec_sample(mach, inst, TEX_MODIFIER_NONE);
4175      break;
4176
4177   case TGSI_OPCODE_SAMPLE_C_LZ:
4178      exec_sample(mach, inst, TEX_MODIFIER_LOD_BIAS);
4179      break;
4180
4181   case TGSI_OPCODE_SAMPLE_D:
4182      exec_sample_d(mach, inst);
4183      break;
4184
4185   case TGSI_OPCODE_SAMPLE_L:
4186      exec_sample(mach, inst, TEX_MODIFIER_EXPLICIT_LOD);
4187      break;
4188
4189   case TGSI_OPCODE_GATHER4:
4190      assert(0);
4191      break;
4192
4193   case TGSI_OPCODE_SVIEWINFO:
4194      assert(0);
4195      break;
4196
4197   case TGSI_OPCODE_SAMPLE_POS:
4198      assert(0);
4199      break;
4200
4201   case TGSI_OPCODE_SAMPLE_INFO:
4202      assert(0);
4203      break;
4204
4205   case TGSI_OPCODE_UARL:
4206      exec_vector_unary(mach, inst, micro_uarl, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_UINT);
4207      break;
4208
4209   case TGSI_OPCODE_UCMP:
4210      exec_vector_trinary(mach, inst, micro_ucmp, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4211      break;
4212
4213   case TGSI_OPCODE_IABS:
4214      exec_vector_unary(mach, inst, micro_iabs, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
4215      break;
4216
4217   case TGSI_OPCODE_ISSG:
4218      exec_vector_unary(mach, inst, micro_isgn, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
4219      break;
4220
4221   default:
4222      assert( 0 );
4223   }
4224}
4225
4226
4227#define DEBUG_EXECUTION 0
4228
4229
4230/**
4231 * Run TGSI interpreter.
4232 * \return bitmask of "alive" quad components
4233 */
4234uint
4235tgsi_exec_machine_run( struct tgsi_exec_machine *mach )
4236{
4237   uint i;
4238   int pc = 0;
4239
4240   mach->CondMask = 0xf;
4241   mach->LoopMask = 0xf;
4242   mach->ContMask = 0xf;
4243   mach->FuncMask = 0xf;
4244   mach->ExecMask = 0xf;
4245
4246   mach->Switch.mask = 0xf;
4247
4248   assert(mach->CondStackTop == 0);
4249   assert(mach->LoopStackTop == 0);
4250   assert(mach->ContStackTop == 0);
4251   assert(mach->SwitchStackTop == 0);
4252   assert(mach->BreakStackTop == 0);
4253   assert(mach->CallStackTop == 0);
4254
4255   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] = 0;
4256   mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] = 0;
4257
4258   if( mach->Processor == TGSI_PROCESSOR_GEOMETRY ) {
4259      mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0] = 0;
4260      mach->Primitives[0] = 0;
4261   }
4262
4263   /* execute declarations (interpolants) */
4264   for (i = 0; i < mach->NumDeclarations; i++) {
4265      exec_declaration( mach, mach->Declarations+i );
4266   }
4267
4268   {
4269#if DEBUG_EXECUTION
4270      struct tgsi_exec_vector temps[TGSI_EXEC_NUM_TEMPS + TGSI_EXEC_NUM_TEMP_EXTRAS];
4271      struct tgsi_exec_vector outputs[PIPE_MAX_ATTRIBS];
4272      uint inst = 1;
4273
4274      memcpy(temps, mach->Temps, sizeof(temps));
4275      memcpy(outputs, mach->Outputs, sizeof(outputs));
4276#endif
4277
4278      /* execute instructions, until pc is set to -1 */
4279      while (pc != -1) {
4280
4281#if DEBUG_EXECUTION
4282         uint i;
4283
4284         tgsi_dump_instruction(&mach->Instructions[pc], inst++);
4285#endif
4286
4287         assert(pc < (int) mach->NumInstructions);
4288         exec_instruction(mach, mach->Instructions + pc, &pc);
4289
4290#if DEBUG_EXECUTION
4291         for (i = 0; i < TGSI_EXEC_NUM_TEMPS + TGSI_EXEC_NUM_TEMP_EXTRAS; i++) {
4292            if (memcmp(&temps[i], &mach->Temps[i], sizeof(temps[i]))) {
4293               uint j;
4294
4295               memcpy(&temps[i], &mach->Temps[i], sizeof(temps[i]));
4296               debug_printf("TEMP[%2u] = ", i);
4297               for (j = 0; j < 4; j++) {
4298                  if (j > 0) {
4299                     debug_printf("           ");
4300                  }
4301                  debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
4302                               temps[i].xyzw[0].f[j], temps[i].xyzw[0].u[j],
4303                               temps[i].xyzw[1].f[j], temps[i].xyzw[1].u[j],
4304                               temps[i].xyzw[2].f[j], temps[i].xyzw[2].u[j],
4305                               temps[i].xyzw[3].f[j], temps[i].xyzw[3].u[j]);
4306               }
4307            }
4308         }
4309         for (i = 0; i < PIPE_MAX_ATTRIBS; i++) {
4310            if (memcmp(&outputs[i], &mach->Outputs[i], sizeof(outputs[i]))) {
4311               uint j;
4312
4313               memcpy(&outputs[i], &mach->Outputs[i], sizeof(outputs[i]));
4314               debug_printf("OUT[%2u] =  ", i);
4315               for (j = 0; j < 4; j++) {
4316                  if (j > 0) {
4317                     debug_printf("           ");
4318                  }
4319                  debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
4320                               outputs[i].xyzw[0].f[j], outputs[i].xyzw[0].u[j],
4321                               outputs[i].xyzw[1].f[j], outputs[i].xyzw[1].u[j],
4322                               outputs[i].xyzw[2].f[j], outputs[i].xyzw[2].u[j],
4323                               outputs[i].xyzw[3].f[j], outputs[i].xyzw[3].u[j]);
4324               }
4325            }
4326         }
4327#endif
4328      }
4329   }
4330
4331#if 0
4332   /* we scale from floats in [0,1] to Zbuffer ints in sp_quad_depth_test.c */
4333   if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) {
4334      /*
4335       * Scale back depth component.
4336       */
4337      for (i = 0; i < 4; i++)
4338         mach->Outputs[0].xyzw[2].f[i] *= ctx->DrawBuffer->_DepthMaxF;
4339   }
4340#endif
4341
4342   /* Strictly speaking, these assertions aren't really needed but they
4343    * can potentially catch some bugs in the control flow code.
4344    */
4345   assert(mach->CondStackTop == 0);
4346   assert(mach->LoopStackTop == 0);
4347   assert(mach->ContStackTop == 0);
4348   assert(mach->SwitchStackTop == 0);
4349   assert(mach->BreakStackTop == 0);
4350   assert(mach->CallStackTop == 0);
4351
4352   return ~mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
4353}
4354