tgsi_exec.c revision 9f3c59a35093c61fb11aab6d3ed5cb45f2b8c2a7
14e180b6a0b4720a9b8e9e959a882386f690f08ffTorne (Richard Coles)/**************************************************************************
24e180b6a0b4720a9b8e9e959a882386f690f08ffTorne (Richard Coles) *
34e180b6a0b4720a9b8e9e959a882386f690f08ffTorne (Richard Coles) * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
44e180b6a0b4720a9b8e9e959a882386f690f08ffTorne (Richard Coles) * All Rights Reserved.
54e180b6a0b4720a9b8e9e959a882386f690f08ffTorne (Richard Coles) * Copyright 2009-2010 VMware, Inc.  All rights Reserved.
64e180b6a0b4720a9b8e9e959a882386f690f08ffTorne (Richard Coles) *
74e180b6a0b4720a9b8e9e959a882386f690f08ffTorne (Richard Coles) * Permission is hereby granted, free of charge, to any person obtaining a
84e180b6a0b4720a9b8e9e959a882386f690f08ffTorne (Richard Coles) * copy of this software and associated documentation files (the
94e180b6a0b4720a9b8e9e959a882386f690f08ffTorne (Richard Coles) * "Software"), to deal in the Software without restriction, including
10116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch * without limitation the rights to use, copy, modify, merge, publish,
1123730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles) * distribute, sub license, and/or sell copies of the Software, and to
124e180b6a0b4720a9b8e9e959a882386f690f08ffTorne (Richard Coles) * permit persons to whom the Software is furnished to do so, subject to
134e180b6a0b4720a9b8e9e959a882386f690f08ffTorne (Richard Coles) * the following conditions:
144e180b6a0b4720a9b8e9e959a882386f690f08ffTorne (Richard Coles) *
154e180b6a0b4720a9b8e9e959a882386f690f08ffTorne (Richard Coles) * The above copyright notice and this permission notice (including the
164e180b6a0b4720a9b8e9e959a882386f690f08ffTorne (Richard Coles) * next paragraph) shall be included in all copies or substantial portions
174e180b6a0b4720a9b8e9e959a882386f690f08ffTorne (Richard Coles) * of the Software.
184e180b6a0b4720a9b8e9e959a882386f690f08ffTorne (Richard Coles) *
194e180b6a0b4720a9b8e9e959a882386f690f08ffTorne (Richard Coles) * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
224e180b6a0b4720a9b8e9e959a882386f690f08ffTorne (Richard Coles) * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
234e180b6a0b4720a9b8e9e959a882386f690f08ffTorne (Richard Coles) * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
244e180b6a0b4720a9b8e9e959a882386f690f08ffTorne (Richard Coles) * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
25116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
264e180b6a0b4720a9b8e9e959a882386f690f08ffTorne (Richard Coles) *
274e180b6a0b4720a9b8e9e959a882386f690f08ffTorne (Richard Coles) **************************************************************************/
284e180b6a0b4720a9b8e9e959a882386f690f08ffTorne (Richard Coles)
/**
 * TGSI interpreter/executor.
 *
 * Flow control information:
 *
 * Since we operate on 'quads' (4 pixels or 4 vertices in parallel)
 * flow control statements (IF/ELSE/ENDIF, LOOP/ENDLOOP) require special
 * care since a condition may be true for some quad components but false
 * for other components.
 *
 * We basically execute all statements (even if they're in the part of
 * an IF/ELSE clause that's "not taken") and use a special mask to
 * control writing to destination registers.  This is the ExecMask.
 * See store_dest().
 *
 * The ExecMask is the bitwise AND of several other masks (CondMask,
 * LoopMask, ContMask, Switch.mask and FuncMask; see UPDATE_EXEC_MASK),
 * which are controlled by the flow control instructions (for example
 * IF/ELSE/ENDIF, LOOP/ENDLOOP and CONT).
 *
 *
 * Authors:
 *   Michal Krol
 *   Brian Paul
 */
534e180b6a0b4720a9b8e9e959a882386f690f08ffTorne (Richard Coles)
544e180b6a0b4720a9b8e9e959a882386f690f08ffTorne (Richard Coles)#include "pipe/p_compiler.h"
554e180b6a0b4720a9b8e9e959a882386f690f08ffTorne (Richard Coles)#include "pipe/p_state.h"
564e180b6a0b4720a9b8e9e959a882386f690f08ffTorne (Richard Coles)#include "pipe/p_shader_tokens.h"
574e180b6a0b4720a9b8e9e959a882386f690f08ffTorne (Richard Coles)#include "tgsi/tgsi_dump.h"
584e180b6a0b4720a9b8e9e959a882386f690f08ffTorne (Richard Coles)#include "tgsi/tgsi_parse.h"
594e180b6a0b4720a9b8e9e959a882386f690f08ffTorne (Richard Coles)#include "tgsi/tgsi_util.h"
60#include "tgsi_exec.h"
61#include "util/u_memory.h"
62#include "util/u_math.h"
63
64
/* When non-zero, micro_exp2() and micro_lg2() use the util_fast_* helpers
 * instead of the libm functions.
 */
#define FAST_MATH 1

/* Indices of the four elements of a quad inside a tgsi_exec_channel;
 * micro_ddx() and micro_ddy() use them to pick the elements they subtract.
 */
#define TILE_TOP_LEFT     0
#define TILE_TOP_RIGHT    1
#define TILE_BOTTOM_LEFT  2
#define TILE_BOTTOM_RIGHT 3
71
72static void
73micro_abs(union tgsi_exec_channel *dst,
74          const union tgsi_exec_channel *src)
75{
76   dst->f[0] = fabsf(src->f[0]);
77   dst->f[1] = fabsf(src->f[1]);
78   dst->f[2] = fabsf(src->f[2]);
79   dst->f[3] = fabsf(src->f[3]);
80}
81
82static void
83micro_arl(union tgsi_exec_channel *dst,
84          const union tgsi_exec_channel *src)
85{
86   dst->i[0] = (int)floorf(src->f[0]);
87   dst->i[1] = (int)floorf(src->f[1]);
88   dst->i[2] = (int)floorf(src->f[2]);
89   dst->i[3] = (int)floorf(src->f[3]);
90}
91
92static void
93micro_arr(union tgsi_exec_channel *dst,
94          const union tgsi_exec_channel *src)
95{
96   dst->i[0] = (int)floorf(src->f[0] + 0.5f);
97   dst->i[1] = (int)floorf(src->f[1] + 0.5f);
98   dst->i[2] = (int)floorf(src->f[2] + 0.5f);
99   dst->i[3] = (int)floorf(src->f[3] + 0.5f);
100}
101
102static void
103micro_ceil(union tgsi_exec_channel *dst,
104           const union tgsi_exec_channel *src)
105{
106   dst->f[0] = ceilf(src->f[0]);
107   dst->f[1] = ceilf(src->f[1]);
108   dst->f[2] = ceilf(src->f[2]);
109   dst->f[3] = ceilf(src->f[3]);
110}
111
112static void
113micro_clamp(union tgsi_exec_channel *dst,
114            const union tgsi_exec_channel *src0,
115            const union tgsi_exec_channel *src1,
116            const union tgsi_exec_channel *src2)
117{
118   dst->f[0] = src0->f[0] < src1->f[0] ? src1->f[0] : src0->f[0] > src2->f[0] ? src2->f[0] : src0->f[0];
119   dst->f[1] = src0->f[1] < src1->f[1] ? src1->f[1] : src0->f[1] > src2->f[1] ? src2->f[1] : src0->f[1];
120   dst->f[2] = src0->f[2] < src1->f[2] ? src1->f[2] : src0->f[2] > src2->f[2] ? src2->f[2] : src0->f[2];
121   dst->f[3] = src0->f[3] < src1->f[3] ? src1->f[3] : src0->f[3] > src2->f[3] ? src2->f[3] : src0->f[3];
122}
123
124static void
125micro_cmp(union tgsi_exec_channel *dst,
126          const union tgsi_exec_channel *src0,
127          const union tgsi_exec_channel *src1,
128          const union tgsi_exec_channel *src2)
129{
130   dst->f[0] = src0->f[0] < 0.0f ? src1->f[0] : src2->f[0];
131   dst->f[1] = src0->f[1] < 0.0f ? src1->f[1] : src2->f[1];
132   dst->f[2] = src0->f[2] < 0.0f ? src1->f[2] : src2->f[2];
133   dst->f[3] = src0->f[3] < 0.0f ? src1->f[3] : src2->f[3];
134}
135
136static void
137micro_cnd(union tgsi_exec_channel *dst,
138          const union tgsi_exec_channel *src0,
139          const union tgsi_exec_channel *src1,
140          const union tgsi_exec_channel *src2)
141{
142   dst->f[0] = src2->f[0] > 0.5f ? src0->f[0] : src1->f[0];
143   dst->f[1] = src2->f[1] > 0.5f ? src0->f[1] : src1->f[1];
144   dst->f[2] = src2->f[2] > 0.5f ? src0->f[2] : src1->f[2];
145   dst->f[3] = src2->f[3] > 0.5f ? src0->f[3] : src1->f[3];
146}
147
148static void
149micro_cos(union tgsi_exec_channel *dst,
150          const union tgsi_exec_channel *src)
151{
152   dst->f[0] = cosf(src->f[0]);
153   dst->f[1] = cosf(src->f[1]);
154   dst->f[2] = cosf(src->f[2]);
155   dst->f[3] = cosf(src->f[3]);
156}
157
158static void
159micro_ddx(union tgsi_exec_channel *dst,
160          const union tgsi_exec_channel *src)
161{
162   dst->f[0] =
163   dst->f[1] =
164   dst->f[2] =
165   dst->f[3] = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_BOTTOM_LEFT];
166}
167
168static void
169micro_ddy(union tgsi_exec_channel *dst,
170          const union tgsi_exec_channel *src)
171{
172   dst->f[0] =
173   dst->f[1] =
174   dst->f[2] =
175   dst->f[3] = src->f[TILE_BOTTOM_LEFT] - src->f[TILE_TOP_LEFT];
176}
177
178static void
179micro_exp2(union tgsi_exec_channel *dst,
180           const union tgsi_exec_channel *src)
181{
182#if FAST_MATH
183   dst->f[0] = util_fast_exp2(src->f[0]);
184   dst->f[1] = util_fast_exp2(src->f[1]);
185   dst->f[2] = util_fast_exp2(src->f[2]);
186   dst->f[3] = util_fast_exp2(src->f[3]);
187#else
188#if DEBUG
189   /* Inf is okay for this instruction, so clamp it to silence assertions. */
190   uint i;
191   union tgsi_exec_channel clamped;
192
193   for (i = 0; i < 4; i++) {
194      if (src->f[i] > 127.99999f) {
195         clamped.f[i] = 127.99999f;
196      } else if (src->f[i] < -126.99999f) {
197         clamped.f[i] = -126.99999f;
198      } else {
199         clamped.f[i] = src->f[i];
200      }
201   }
202   src = &clamped;
203#endif /* DEBUG */
204
205   dst->f[0] = powf(2.0f, src->f[0]);
206   dst->f[1] = powf(2.0f, src->f[1]);
207   dst->f[2] = powf(2.0f, src->f[2]);
208   dst->f[3] = powf(2.0f, src->f[3]);
209#endif /* FAST_MATH */
210}
211
212static void
213micro_flr(union tgsi_exec_channel *dst,
214          const union tgsi_exec_channel *src)
215{
216   dst->f[0] = floorf(src->f[0]);
217   dst->f[1] = floorf(src->f[1]);
218   dst->f[2] = floorf(src->f[2]);
219   dst->f[3] = floorf(src->f[3]);
220}
221
222static void
223micro_frc(union tgsi_exec_channel *dst,
224          const union tgsi_exec_channel *src)
225{
226   dst->f[0] = src->f[0] - floorf(src->f[0]);
227   dst->f[1] = src->f[1] - floorf(src->f[1]);
228   dst->f[2] = src->f[2] - floorf(src->f[2]);
229   dst->f[3] = src->f[3] - floorf(src->f[3]);
230}
231
232static void
233micro_iabs(union tgsi_exec_channel *dst,
234           const union tgsi_exec_channel *src)
235{
236   dst->i[0] = src->i[0] >= 0 ? src->i[0] : -src->i[0];
237   dst->i[1] = src->i[1] >= 0 ? src->i[1] : -src->i[1];
238   dst->i[2] = src->i[2] >= 0 ? src->i[2] : -src->i[2];
239   dst->i[3] = src->i[3] >= 0 ? src->i[3] : -src->i[3];
240}
241
242static void
243micro_ineg(union tgsi_exec_channel *dst,
244           const union tgsi_exec_channel *src)
245{
246   dst->i[0] = -src->i[0];
247   dst->i[1] = -src->i[1];
248   dst->i[2] = -src->i[2];
249   dst->i[3] = -src->i[3];
250}
251
252static void
253micro_lg2(union tgsi_exec_channel *dst,
254          const union tgsi_exec_channel *src)
255{
256#if FAST_MATH
257   dst->f[0] = util_fast_log2(src->f[0]);
258   dst->f[1] = util_fast_log2(src->f[1]);
259   dst->f[2] = util_fast_log2(src->f[2]);
260   dst->f[3] = util_fast_log2(src->f[3]);
261#else
262   dst->f[0] = logf(src->f[0]) * 1.442695f;
263   dst->f[1] = logf(src->f[1]) * 1.442695f;
264   dst->f[2] = logf(src->f[2]) * 1.442695f;
265   dst->f[3] = logf(src->f[3]) * 1.442695f;
266#endif
267}
268
269static void
270micro_lrp(union tgsi_exec_channel *dst,
271          const union tgsi_exec_channel *src0,
272          const union tgsi_exec_channel *src1,
273          const union tgsi_exec_channel *src2)
274{
275   dst->f[0] = src0->f[0] * (src1->f[0] - src2->f[0]) + src2->f[0];
276   dst->f[1] = src0->f[1] * (src1->f[1] - src2->f[1]) + src2->f[1];
277   dst->f[2] = src0->f[2] * (src1->f[2] - src2->f[2]) + src2->f[2];
278   dst->f[3] = src0->f[3] * (src1->f[3] - src2->f[3]) + src2->f[3];
279}
280
281static void
282micro_mad(union tgsi_exec_channel *dst,
283          const union tgsi_exec_channel *src0,
284          const union tgsi_exec_channel *src1,
285          const union tgsi_exec_channel *src2)
286{
287   dst->f[0] = src0->f[0] * src1->f[0] + src2->f[0];
288   dst->f[1] = src0->f[1] * src1->f[1] + src2->f[1];
289   dst->f[2] = src0->f[2] * src1->f[2] + src2->f[2];
290   dst->f[3] = src0->f[3] * src1->f[3] + src2->f[3];
291}
292
293static void
294micro_mov(union tgsi_exec_channel *dst,
295          const union tgsi_exec_channel *src)
296{
297   dst->u[0] = src->u[0];
298   dst->u[1] = src->u[1];
299   dst->u[2] = src->u[2];
300   dst->u[3] = src->u[3];
301}
302
303static void
304micro_rcp(union tgsi_exec_channel *dst,
305          const union tgsi_exec_channel *src)
306{
307#if 0 /* for debugging */
308   assert(src->f[0] != 0.0f);
309   assert(src->f[1] != 0.0f);
310   assert(src->f[2] != 0.0f);
311   assert(src->f[3] != 0.0f);
312#endif
313   dst->f[0] = 1.0f / src->f[0];
314   dst->f[1] = 1.0f / src->f[1];
315   dst->f[2] = 1.0f / src->f[2];
316   dst->f[3] = 1.0f / src->f[3];
317}
318
319static void
320micro_rnd(union tgsi_exec_channel *dst,
321          const union tgsi_exec_channel *src)
322{
323   dst->f[0] = floorf(src->f[0] + 0.5f);
324   dst->f[1] = floorf(src->f[1] + 0.5f);
325   dst->f[2] = floorf(src->f[2] + 0.5f);
326   dst->f[3] = floorf(src->f[3] + 0.5f);
327}
328
329static void
330micro_rsq(union tgsi_exec_channel *dst,
331          const union tgsi_exec_channel *src)
332{
333#if 0 /* for debugging */
334   assert(src->f[0] != 0.0f);
335   assert(src->f[1] != 0.0f);
336   assert(src->f[2] != 0.0f);
337   assert(src->f[3] != 0.0f);
338#endif
339   dst->f[0] = 1.0f / sqrtf(fabsf(src->f[0]));
340   dst->f[1] = 1.0f / sqrtf(fabsf(src->f[1]));
341   dst->f[2] = 1.0f / sqrtf(fabsf(src->f[2]));
342   dst->f[3] = 1.0f / sqrtf(fabsf(src->f[3]));
343}
344
345static void
346micro_seq(union tgsi_exec_channel *dst,
347          const union tgsi_exec_channel *src0,
348          const union tgsi_exec_channel *src1)
349{
350   dst->f[0] = src0->f[0] == src1->f[0] ? 1.0f : 0.0f;
351   dst->f[1] = src0->f[1] == src1->f[1] ? 1.0f : 0.0f;
352   dst->f[2] = src0->f[2] == src1->f[2] ? 1.0f : 0.0f;
353   dst->f[3] = src0->f[3] == src1->f[3] ? 1.0f : 0.0f;
354}
355
356static void
357micro_sge(union tgsi_exec_channel *dst,
358          const union tgsi_exec_channel *src0,
359          const union tgsi_exec_channel *src1)
360{
361   dst->f[0] = src0->f[0] >= src1->f[0] ? 1.0f : 0.0f;
362   dst->f[1] = src0->f[1] >= src1->f[1] ? 1.0f : 0.0f;
363   dst->f[2] = src0->f[2] >= src1->f[2] ? 1.0f : 0.0f;
364   dst->f[3] = src0->f[3] >= src1->f[3] ? 1.0f : 0.0f;
365}
366
367static void
368micro_sgn(union tgsi_exec_channel *dst,
369          const union tgsi_exec_channel *src)
370{
371   dst->f[0] = src->f[0] < 0.0f ? -1.0f : src->f[0] > 0.0f ? 1.0f : 0.0f;
372   dst->f[1] = src->f[1] < 0.0f ? -1.0f : src->f[1] > 0.0f ? 1.0f : 0.0f;
373   dst->f[2] = src->f[2] < 0.0f ? -1.0f : src->f[2] > 0.0f ? 1.0f : 0.0f;
374   dst->f[3] = src->f[3] < 0.0f ? -1.0f : src->f[3] > 0.0f ? 1.0f : 0.0f;
375}
376
377static void
378micro_sgt(union tgsi_exec_channel *dst,
379          const union tgsi_exec_channel *src0,
380          const union tgsi_exec_channel *src1)
381{
382   dst->f[0] = src0->f[0] > src1->f[0] ? 1.0f : 0.0f;
383   dst->f[1] = src0->f[1] > src1->f[1] ? 1.0f : 0.0f;
384   dst->f[2] = src0->f[2] > src1->f[2] ? 1.0f : 0.0f;
385   dst->f[3] = src0->f[3] > src1->f[3] ? 1.0f : 0.0f;
386}
387
388static void
389micro_sin(union tgsi_exec_channel *dst,
390          const union tgsi_exec_channel *src)
391{
392   dst->f[0] = sinf(src->f[0]);
393   dst->f[1] = sinf(src->f[1]);
394   dst->f[2] = sinf(src->f[2]);
395   dst->f[3] = sinf(src->f[3]);
396}
397
398static void
399micro_sle(union tgsi_exec_channel *dst,
400          const union tgsi_exec_channel *src0,
401          const union tgsi_exec_channel *src1)
402{
403   dst->f[0] = src0->f[0] <= src1->f[0] ? 1.0f : 0.0f;
404   dst->f[1] = src0->f[1] <= src1->f[1] ? 1.0f : 0.0f;
405   dst->f[2] = src0->f[2] <= src1->f[2] ? 1.0f : 0.0f;
406   dst->f[3] = src0->f[3] <= src1->f[3] ? 1.0f : 0.0f;
407}
408
409static void
410micro_slt(union tgsi_exec_channel *dst,
411          const union tgsi_exec_channel *src0,
412          const union tgsi_exec_channel *src1)
413{
414   dst->f[0] = src0->f[0] < src1->f[0] ? 1.0f : 0.0f;
415   dst->f[1] = src0->f[1] < src1->f[1] ? 1.0f : 0.0f;
416   dst->f[2] = src0->f[2] < src1->f[2] ? 1.0f : 0.0f;
417   dst->f[3] = src0->f[3] < src1->f[3] ? 1.0f : 0.0f;
418}
419
420static void
421micro_sne(union tgsi_exec_channel *dst,
422          const union tgsi_exec_channel *src0,
423          const union tgsi_exec_channel *src1)
424{
425   dst->f[0] = src0->f[0] != src1->f[0] ? 1.0f : 0.0f;
426   dst->f[1] = src0->f[1] != src1->f[1] ? 1.0f : 0.0f;
427   dst->f[2] = src0->f[2] != src1->f[2] ? 1.0f : 0.0f;
428   dst->f[3] = src0->f[3] != src1->f[3] ? 1.0f : 0.0f;
429}
430
431static void
432micro_sfl(union tgsi_exec_channel *dst)
433{
434   dst->f[0] = 0.0f;
435   dst->f[1] = 0.0f;
436   dst->f[2] = 0.0f;
437   dst->f[3] = 0.0f;
438}
439
440static void
441micro_str(union tgsi_exec_channel *dst)
442{
443   dst->f[0] = 1.0f;
444   dst->f[1] = 1.0f;
445   dst->f[2] = 1.0f;
446   dst->f[3] = 1.0f;
447}
448
449static void
450micro_trunc(union tgsi_exec_channel *dst,
451            const union tgsi_exec_channel *src)
452{
453   dst->f[0] = (float)(int)src->f[0];
454   dst->f[1] = (float)(int)src->f[1];
455   dst->f[2] = (float)(int)src->f[2];
456   dst->f[3] = (float)(int)src->f[3];
457}
458
459
/* Numeric indices 0..3 for the X, Y, Z and W channels. */
#define CHAN_X  0
#define CHAN_Y  1
#define CHAN_Z  2
#define CHAN_W  3
464
/**
 * Identifiers for float, int and uint data.  How they are consumed is
 * not visible in this part of the file.
 */
enum tgsi_exec_datatype {
   TGSI_EXEC_DATA_FLOAT = 0,
   TGSI_EXEC_DATA_INT   = 1,
   TGSI_EXEC_DATA_UINT  = 2
};
470
/*
 * Shorthand locations of various utility registers (_I = Index, _C = Channel)
 *
 * Each name is a plain alias for the TGSI_EXEC_TEMP_* macro of the same
 * suffix, which is defined outside this file.
 */
#define TEMP_KILMASK_I     TGSI_EXEC_TEMP_KILMASK_I
#define TEMP_KILMASK_C     TGSI_EXEC_TEMP_KILMASK_C
#define TEMP_OUTPUT_I      TGSI_EXEC_TEMP_OUTPUT_I
#define TEMP_OUTPUT_C      TGSI_EXEC_TEMP_OUTPUT_C
#define TEMP_PRIMITIVE_I   TGSI_EXEC_TEMP_PRIMITIVE_I
#define TEMP_PRIMITIVE_C   TGSI_EXEC_TEMP_PRIMITIVE_C
480
481
/**
 * Recompute the execution mask of MACH as the bitwise AND of the
 * condition, loop, continue, switch and function-call masks.
 *
 * MACH is a pointer expression; it is parenthesized at every use (and the
 * whole expansion is parenthesized) so that arguments such as '&machine'
 * expand correctly.  Note that MACH is evaluated several times, so it
 * must not have side effects.
 */
#define UPDATE_EXEC_MASK(MACH) \
      ((MACH)->ExecMask = (MACH)->CondMask & (MACH)->LoopMask & \
                          (MACH)->ContMask & (MACH)->Switch.mask & \
                          (MACH)->FuncMask)
485
486
487static const union tgsi_exec_channel ZeroVec =
488   { { 0.0, 0.0, 0.0, 0.0 } };
489
490static const union tgsi_exec_channel OneVec = {
491   {1.0f, 1.0f, 1.0f, 1.0f}
492};
493
494static const union tgsi_exec_channel P128Vec = {
495   {128.0f, 128.0f, 128.0f, 128.0f}
496};
497
498static const union tgsi_exec_channel M128Vec = {
499   {-128.0f, -128.0f, -128.0f, -128.0f}
500};
501
502
503/**
504 * Assert that none of the float values in 'chan' are infinite or NaN.
505 * NaN and Inf may occur normally during program execution and should
506 * not lead to crashes, etc.  But when debugging, it's helpful to catch
507 * them.
508 */
509static INLINE void
510check_inf_or_nan(const union tgsi_exec_channel *chan)
511{
512   assert(!util_is_inf_or_nan((chan)->f[0]));
513   assert(!util_is_inf_or_nan((chan)->f[1]));
514   assert(!util_is_inf_or_nan((chan)->f[2]));
515   assert(!util_is_inf_or_nan((chan)->f[3]));
516}
517
518
#ifdef DEBUG
/**
 * Debug helper: print 'msg' followed by the four float elements of 'chan'.
 */
static void
print_chan(const char *msg, const union tgsi_exec_channel *chan)
{
   const float *elems = chan->f;

   debug_printf("%s = {%f, %f, %f, %f}\n",
                msg, elems[0], elems[1], elems[2], elems[3]);
}
#endif
527
528
#ifdef DEBUG
/**
 * Debug helper: print the X, Y, Z and W channels of temporary register
 * 'index', one line of four floats per channel.
 */
static void
print_temp(const struct tgsi_exec_machine *mach, uint index)
{
   static const char channel_names[] = "XYZW";
   const struct tgsi_exec_vector *tmp = &mach->Temps[index];
   int chan = 0;

   debug_printf("Temp[%u] =\n", index);
   while (chan < 4) {
      const float *elems = tmp->xyzw[chan].f;

      debug_printf("  %c: { %f, %f, %f, %f }\n",
                   channel_names[chan],
                   elems[0], elems[1], elems[2], elems[3]);
      chan++;
   }
}
#endif
546
547
548void
549tgsi_exec_set_constant_buffers(struct tgsi_exec_machine *mach,
550                               unsigned num_bufs,
551                               const void **bufs,
552                               const unsigned *buf_sizes)
553{
554   unsigned i;
555
556   for (i = 0; i < num_bufs; i++) {
557      mach->Consts[i] = bufs[i];
558      mach->ConstsSize[i] = buf_sizes[i];
559   }
560}
561
562
563/**
564 * Check if there's a potential src/dst register data dependency when
565 * using SOA execution.
566 * Example:
567 *   MOV T, T.yxwz;
568 * This would expand into:
569 *   MOV t0, t1;
570 *   MOV t1, t0;
571 *   MOV t2, t3;
572 *   MOV t3, t2;
573 * The second instruction will have the wrong value for t0 if executed as-is.
574 */
575boolean
576tgsi_check_soa_dependencies(const struct tgsi_full_instruction *inst)
577{
578   uint i, chan;
579
580   uint writemask = inst->Dst[0].Register.WriteMask;
581   if (writemask == TGSI_WRITEMASK_X ||
582       writemask == TGSI_WRITEMASK_Y ||
583       writemask == TGSI_WRITEMASK_Z ||
584       writemask == TGSI_WRITEMASK_W ||
585       writemask == TGSI_WRITEMASK_NONE) {
586      /* no chance of data dependency */
587      return FALSE;
588   }
589
590   /* loop over src regs */
591   for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
592      if ((inst->Src[i].Register.File ==
593           inst->Dst[0].Register.File) &&
594          ((inst->Src[i].Register.Index ==
595            inst->Dst[0].Register.Index) ||
596           inst->Src[i].Register.Indirect ||
597           inst->Dst[0].Register.Indirect)) {
598         /* loop over dest channels */
599         uint channelsWritten = 0x0;
600         for (chan = 0; chan < NUM_CHANNELS; chan++) {
601            if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
602               /* check if we're reading a channel that's been written */
603               uint swizzle = tgsi_util_get_full_src_register_swizzle(&inst->Src[i], chan);
604               if (channelsWritten & (1 << swizzle)) {
605                  return TRUE;
606               }
607
608               channelsWritten |= (1 << chan);
609            }
610         }
611      }
612   }
613   return FALSE;
614}
615
616
617/**
618 * Initialize machine state by expanding tokens to full instructions,
619 * allocating temporary storage, setting up constants, etc.
620 * After this, we can call tgsi_exec_machine_run() many times.
621 */
622void
623tgsi_exec_machine_bind_shader(
624   struct tgsi_exec_machine *mach,
625   const struct tgsi_token *tokens,
626   uint numSamplers,
627   struct tgsi_sampler **samplers)
628{
629   uint k;
630   struct tgsi_parse_context parse;
631   struct tgsi_full_instruction *instructions;
632   struct tgsi_full_declaration *declarations;
633   uint maxInstructions = 10, numInstructions = 0;
634   uint maxDeclarations = 10, numDeclarations = 0;
635
636#if 0
637   tgsi_dump(tokens, 0);
638#endif
639
640   util_init_math();
641
642   if (numSamplers) {
643      assert(samplers);
644   }
645
646   mach->Tokens = tokens;
647   mach->Samplers = samplers;
648
649   if (!tokens) {
650      /* unbind and free all */
651      if (mach->Declarations) {
652         FREE( mach->Declarations );
653      }
654      mach->Declarations = NULL;
655      mach->NumDeclarations = 0;
656
657      if (mach->Instructions) {
658         FREE( mach->Instructions );
659      }
660      mach->Instructions = NULL;
661      mach->NumInstructions = 0;
662
663      return;
664   }
665
666   k = tgsi_parse_init (&parse, mach->Tokens);
667   if (k != TGSI_PARSE_OK) {
668      debug_printf( "Problem parsing!\n" );
669      return;
670   }
671
672   mach->Processor = parse.FullHeader.Processor.Processor;
673   mach->ImmLimit = 0;
674
675   if (mach->Processor == TGSI_PROCESSOR_GEOMETRY &&
676       !mach->UsedGeometryShader) {
677      struct tgsi_exec_vector *inputs =
678         align_malloc(sizeof(struct tgsi_exec_vector) *
679                      TGSI_MAX_PRIM_VERTICES * PIPE_MAX_ATTRIBS,
680                      16);
681      struct tgsi_exec_vector *outputs =
682         align_malloc(sizeof(struct tgsi_exec_vector) *
683                      TGSI_MAX_TOTAL_VERTICES, 16);
684
685      if (!inputs)
686         return;
687      if (!outputs) {
688         align_free(inputs);
689         return;
690      }
691
692      align_free(mach->Inputs);
693      align_free(mach->Outputs);
694
695      mach->Inputs = inputs;
696      mach->Outputs = outputs;
697      mach->UsedGeometryShader = TRUE;
698   }
699
700   declarations = (struct tgsi_full_declaration *)
701      MALLOC( maxDeclarations * sizeof(struct tgsi_full_declaration) );
702
703   if (!declarations) {
704      return;
705   }
706
707   instructions = (struct tgsi_full_instruction *)
708      MALLOC( maxInstructions * sizeof(struct tgsi_full_instruction) );
709
710   if (!instructions) {
711      FREE( declarations );
712      return;
713   }
714
715   while( !tgsi_parse_end_of_tokens( &parse ) ) {
716      uint i;
717
718      tgsi_parse_token( &parse );
719      switch( parse.FullToken.Token.Type ) {
720      case TGSI_TOKEN_TYPE_DECLARATION:
721         /* save expanded declaration */
722         if (numDeclarations == maxDeclarations) {
723            declarations = REALLOC(declarations,
724                                   maxDeclarations
725                                   * sizeof(struct tgsi_full_declaration),
726                                   (maxDeclarations + 10)
727                                   * sizeof(struct tgsi_full_declaration));
728            maxDeclarations += 10;
729         }
730         if (parse.FullToken.FullDeclaration.Declaration.File == TGSI_FILE_OUTPUT) {
731            unsigned reg;
732            for (reg = parse.FullToken.FullDeclaration.Range.First;
733                 reg <= parse.FullToken.FullDeclaration.Range.Last;
734                 ++reg) {
735               ++mach->NumOutputs;
736            }
737         }
738         if (parse.FullToken.FullDeclaration.Declaration.File ==
739             TGSI_FILE_IMMEDIATE_ARRAY) {
740            unsigned reg;
741            struct tgsi_full_declaration *decl =
742               &parse.FullToken.FullDeclaration;
743            debug_assert(decl->Range.Last < TGSI_EXEC_NUM_IMMEDIATES);
744            for (reg = decl->Range.First; reg <= decl->Range.Last; ++reg) {
745               for( i = 0; i < 4; i++ ) {
746                  int idx = reg * 4 + i;
747                  mach->ImmArray[reg][i] = decl->ImmediateData.u[idx].Float;
748               }
749            }
750         }
751         memcpy(declarations + numDeclarations,
752                &parse.FullToken.FullDeclaration,
753                sizeof(declarations[0]));
754         numDeclarations++;
755         break;
756
757      case TGSI_TOKEN_TYPE_IMMEDIATE:
758         {
759            uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
760            assert( size <= 4 );
761            assert( mach->ImmLimit + 1 <= TGSI_EXEC_NUM_IMMEDIATES );
762
763            for( i = 0; i < size; i++ ) {
764               mach->Imms[mach->ImmLimit][i] =
765		  parse.FullToken.FullImmediate.u[i].Float;
766            }
767            mach->ImmLimit += 1;
768         }
769         break;
770
771      case TGSI_TOKEN_TYPE_INSTRUCTION:
772
773         /* save expanded instruction */
774         if (numInstructions == maxInstructions) {
775            instructions = REALLOC(instructions,
776                                   maxInstructions
777                                   * sizeof(struct tgsi_full_instruction),
778                                   (maxInstructions + 10)
779                                   * sizeof(struct tgsi_full_instruction));
780            maxInstructions += 10;
781         }
782
783         memcpy(instructions + numInstructions,
784                &parse.FullToken.FullInstruction,
785                sizeof(instructions[0]));
786
787         numInstructions++;
788         break;
789
790      case TGSI_TOKEN_TYPE_PROPERTY:
791         break;
792
793      default:
794         assert( 0 );
795      }
796   }
797   tgsi_parse_free (&parse);
798
799   if (mach->Declarations) {
800      FREE( mach->Declarations );
801   }
802   mach->Declarations = declarations;
803   mach->NumDeclarations = numDeclarations;
804
805   if (mach->Instructions) {
806      FREE( mach->Instructions );
807   }
808   mach->Instructions = instructions;
809   mach->NumInstructions = numInstructions;
810}
811
812
813struct tgsi_exec_machine *
814tgsi_exec_machine_create( void )
815{
816   struct tgsi_exec_machine *mach;
817   uint i;
818
819   mach = align_malloc( sizeof *mach, 16 );
820   if (!mach)
821      goto fail;
822
823   memset(mach, 0, sizeof(*mach));
824
825   mach->Addrs = &mach->Temps[TGSI_EXEC_TEMP_ADDR];
826   mach->MaxGeometryShaderOutputs = TGSI_MAX_TOTAL_VERTICES;
827   mach->Predicates = &mach->Temps[TGSI_EXEC_TEMP_P0];
828
829   mach->Inputs = align_malloc(sizeof(struct tgsi_exec_vector) * PIPE_MAX_ATTRIBS, 16);
830   mach->Outputs = align_malloc(sizeof(struct tgsi_exec_vector) * PIPE_MAX_ATTRIBS, 16);
831   if (!mach->Inputs || !mach->Outputs)
832      goto fail;
833
834   /* Setup constants needed by the SSE2 executor. */
835   for( i = 0; i < 4; i++ ) {
836      mach->Temps[TGSI_EXEC_TEMP_00000000_I].xyzw[TGSI_EXEC_TEMP_00000000_C].u[i] = 0x00000000;
837      mach->Temps[TGSI_EXEC_TEMP_7FFFFFFF_I].xyzw[TGSI_EXEC_TEMP_7FFFFFFF_C].u[i] = 0x7FFFFFFF;
838      mach->Temps[TGSI_EXEC_TEMP_80000000_I].xyzw[TGSI_EXEC_TEMP_80000000_C].u[i] = 0x80000000;
839      mach->Temps[TGSI_EXEC_TEMP_FFFFFFFF_I].xyzw[TGSI_EXEC_TEMP_FFFFFFFF_C].u[i] = 0xFFFFFFFF;    /* not used */
840      mach->Temps[TGSI_EXEC_TEMP_ONE_I].xyzw[TGSI_EXEC_TEMP_ONE_C].f[i] = 1.0f;
841      mach->Temps[TGSI_EXEC_TEMP_TWO_I].xyzw[TGSI_EXEC_TEMP_TWO_C].f[i] = 2.0f;    /* not used */
842      mach->Temps[TGSI_EXEC_TEMP_128_I].xyzw[TGSI_EXEC_TEMP_128_C].f[i] = 128.0f;
843      mach->Temps[TGSI_EXEC_TEMP_MINUS_128_I].xyzw[TGSI_EXEC_TEMP_MINUS_128_C].f[i] = -128.0f;
844      mach->Temps[TGSI_EXEC_TEMP_THREE_I].xyzw[TGSI_EXEC_TEMP_THREE_C].f[i] = 3.0f;
845      mach->Temps[TGSI_EXEC_TEMP_HALF_I].xyzw[TGSI_EXEC_TEMP_HALF_C].f[i] = 0.5f;
846   }
847
848#ifdef DEBUG
849   /* silence warnings */
850   (void) print_chan;
851   (void) print_temp;
852#endif
853
854   return mach;
855
856fail:
857   if (mach) {
858      align_free(mach->Inputs);
859      align_free(mach->Outputs);
860      align_free(mach);
861   }
862   return NULL;
863}
864
865
866void
867tgsi_exec_machine_destroy(struct tgsi_exec_machine *mach)
868{
869   if (mach) {
870      if (mach->Instructions)
871         FREE(mach->Instructions);
872      if (mach->Declarations)
873         FREE(mach->Declarations);
874
875      align_free(mach->Inputs);
876      align_free(mach->Outputs);
877
878      align_free(mach);
879   }
880}
881
882static void
883micro_add(union tgsi_exec_channel *dst,
884          const union tgsi_exec_channel *src0,
885          const union tgsi_exec_channel *src1)
886{
887   dst->f[0] = src0->f[0] + src1->f[0];
888   dst->f[1] = src0->f[1] + src1->f[1];
889   dst->f[2] = src0->f[2] + src1->f[2];
890   dst->f[3] = src0->f[3] + src1->f[3];
891}
892
893static void
894micro_div(
895   union tgsi_exec_channel *dst,
896   const union tgsi_exec_channel *src0,
897   const union tgsi_exec_channel *src1 )
898{
899   if (src1->f[0] != 0) {
900      dst->f[0] = src0->f[0] / src1->f[0];
901   }
902   if (src1->f[1] != 0) {
903      dst->f[1] = src0->f[1] / src1->f[1];
904   }
905   if (src1->f[2] != 0) {
906      dst->f[2] = src0->f[2] / src1->f[2];
907   }
908   if (src1->f[3] != 0) {
909      dst->f[3] = src0->f[3] / src1->f[3];
910   }
911}
912
913static void
914micro_rcc(union tgsi_exec_channel *dst,
915          const union tgsi_exec_channel *src)
916{
917   uint i;
918
919   for (i = 0; i < 4; i++) {
920      float recip = 1.0f / src->f[i];
921
922      if (recip > 0.0f) {
923         if (recip > 1.884467e+019f) {
924            dst->f[i] = 1.884467e+019f;
925         }
926         else if (recip < 5.42101e-020f) {
927            dst->f[i] = 5.42101e-020f;
928         }
929         else {
930            dst->f[i] = recip;
931         }
932      }
933      else {
934         if (recip < -1.884467e+019f) {
935            dst->f[i] = -1.884467e+019f;
936         }
937         else if (recip > -5.42101e-020f) {
938            dst->f[i] = -5.42101e-020f;
939         }
940         else {
941            dst->f[i] = recip;
942         }
943      }
944   }
945}
946
947static void
948micro_lt(
949   union tgsi_exec_channel *dst,
950   const union tgsi_exec_channel *src0,
951   const union tgsi_exec_channel *src1,
952   const union tgsi_exec_channel *src2,
953   const union tgsi_exec_channel *src3 )
954{
955   dst->f[0] = src0->f[0] < src1->f[0] ? src2->f[0] : src3->f[0];
956   dst->f[1] = src0->f[1] < src1->f[1] ? src2->f[1] : src3->f[1];
957   dst->f[2] = src0->f[2] < src1->f[2] ? src2->f[2] : src3->f[2];
958   dst->f[3] = src0->f[3] < src1->f[3] ? src2->f[3] : src3->f[3];
959}
960
961static void
962micro_max(union tgsi_exec_channel *dst,
963          const union tgsi_exec_channel *src0,
964          const union tgsi_exec_channel *src1)
965{
966   dst->f[0] = src0->f[0] > src1->f[0] ? src0->f[0] : src1->f[0];
967   dst->f[1] = src0->f[1] > src1->f[1] ? src0->f[1] : src1->f[1];
968   dst->f[2] = src0->f[2] > src1->f[2] ? src0->f[2] : src1->f[2];
969   dst->f[3] = src0->f[3] > src1->f[3] ? src0->f[3] : src1->f[3];
970}
971
972static void
973micro_min(union tgsi_exec_channel *dst,
974          const union tgsi_exec_channel *src0,
975          const union tgsi_exec_channel *src1)
976{
977   dst->f[0] = src0->f[0] < src1->f[0] ? src0->f[0] : src1->f[0];
978   dst->f[1] = src0->f[1] < src1->f[1] ? src0->f[1] : src1->f[1];
979   dst->f[2] = src0->f[2] < src1->f[2] ? src0->f[2] : src1->f[2];
980   dst->f[3] = src0->f[3] < src1->f[3] ? src0->f[3] : src1->f[3];
981}
982
983static void
984micro_mul(union tgsi_exec_channel *dst,
985          const union tgsi_exec_channel *src0,
986          const union tgsi_exec_channel *src1)
987{
988   dst->f[0] = src0->f[0] * src1->f[0];
989   dst->f[1] = src0->f[1] * src1->f[1];
990   dst->f[2] = src0->f[2] * src1->f[2];
991   dst->f[3] = src0->f[3] * src1->f[3];
992}
993
994static void
995micro_neg(
996   union tgsi_exec_channel *dst,
997   const union tgsi_exec_channel *src )
998{
999   dst->f[0] = -src->f[0];
1000   dst->f[1] = -src->f[1];
1001   dst->f[2] = -src->f[2];
1002   dst->f[3] = -src->f[3];
1003}
1004
1005static void
1006micro_pow(
1007   union tgsi_exec_channel *dst,
1008   const union tgsi_exec_channel *src0,
1009   const union tgsi_exec_channel *src1 )
1010{
1011#if FAST_MATH
1012   dst->f[0] = util_fast_pow( src0->f[0], src1->f[0] );
1013   dst->f[1] = util_fast_pow( src0->f[1], src1->f[1] );
1014   dst->f[2] = util_fast_pow( src0->f[2], src1->f[2] );
1015   dst->f[3] = util_fast_pow( src0->f[3], src1->f[3] );
1016#else
1017   dst->f[0] = powf( src0->f[0], src1->f[0] );
1018   dst->f[1] = powf( src0->f[1], src1->f[1] );
1019   dst->f[2] = powf( src0->f[2], src1->f[2] );
1020   dst->f[3] = powf( src0->f[3], src1->f[3] );
1021#endif
1022}
1023
1024static void
1025micro_sub(union tgsi_exec_channel *dst,
1026          const union tgsi_exec_channel *src0,
1027          const union tgsi_exec_channel *src1)
1028{
1029   dst->f[0] = src0->f[0] - src1->f[0];
1030   dst->f[1] = src0->f[1] - src1->f[1];
1031   dst->f[2] = src0->f[2] - src1->f[2];
1032   dst->f[3] = src0->f[3] - src1->f[3];
1033}
1034
1035static void
1036fetch_src_file_channel(const struct tgsi_exec_machine *mach,
1037                       const uint file,
1038                       const uint swizzle,
1039                       const union tgsi_exec_channel *index,
1040                       const union tgsi_exec_channel *index2D,
1041                       union tgsi_exec_channel *chan)
1042{
1043   uint i;
1044
1045   assert(swizzle < 4);
1046
1047   switch (file) {
1048   case TGSI_FILE_CONSTANT:
1049      for (i = 0; i < QUAD_SIZE; i++) {
1050         assert(index2D->i[i] >= 0 && index2D->i[i] < PIPE_MAX_CONSTANT_BUFFERS);
1051         assert(mach->Consts[index2D->i[i]]);
1052
1053         if (index->i[i] < 0) {
1054            chan->u[i] = 0;
1055         } else {
1056            /* NOTE: copying the const value as a uint instead of float */
1057            const uint constbuf = index2D->i[i];
1058            const uint *buf = (const uint *)mach->Consts[constbuf];
1059            const int pos = index->i[i] * 4 + swizzle;
1060            /* const buffer bounds check */
1061            if (pos < 0 || pos >= mach->ConstsSize[constbuf]) {
1062               if (0) {
1063                  /* Debug: print warning */
1064                  static int count = 0;
1065                  if (count++ < 100)
1066                     debug_printf("TGSI Exec: const buffer index %d"
1067                                  " out of bounds\n", pos);
1068               }
1069               chan->u[i] = 0;
1070            }
1071            else
1072               chan->u[i] = buf[pos];
1073         }
1074      }
1075      break;
1076
1077   case TGSI_FILE_INPUT:
1078      for (i = 0; i < QUAD_SIZE; i++) {
1079         /*
1080         if (TGSI_PROCESSOR_GEOMETRY == mach->Processor) {
1081            debug_printf("Fetching Input[%d] (2d=%d, 1d=%d)\n",
1082                         index2D->i[i] * TGSI_EXEC_MAX_INPUT_ATTRIBS + index->i[i],
1083                         index2D->i[i], index->i[i]);
1084                         }*/
1085         int pos = index2D->i[i] * TGSI_EXEC_MAX_INPUT_ATTRIBS + index->i[i];
1086         assert(pos >= 0);
1087         assert(pos < TGSI_MAX_PRIM_VERTICES * PIPE_MAX_ATTRIBS);
1088         chan->u[i] = mach->Inputs[pos].xyzw[swizzle].u[i];
1089      }
1090      break;
1091
1092   case TGSI_FILE_SYSTEM_VALUE:
1093      /* XXX no swizzling at this point.  Will be needed if we put
1094       * gl_FragCoord, for example, in a sys value register.
1095       */
1096      for (i = 0; i < QUAD_SIZE; i++) {
1097         chan->f[i] = mach->SystemValue[index->i[i]][0];
1098      }
1099      break;
1100
1101   case TGSI_FILE_TEMPORARY:
1102      for (i = 0; i < QUAD_SIZE; i++) {
1103         assert(index->i[i] < TGSI_EXEC_NUM_TEMPS);
1104         assert(index2D->i[i] == 0);
1105
1106         chan->u[i] = mach->Temps[index->i[i]].xyzw[swizzle].u[i];
1107      }
1108      break;
1109
1110   case TGSI_FILE_TEMPORARY_ARRAY:
1111      for (i = 0; i < QUAD_SIZE; i++) {
1112         assert(index->i[i] < TGSI_EXEC_NUM_TEMPS);
1113         assert(index2D->i[i] < TGSI_EXEC_NUM_TEMP_ARRAYS);
1114
1115         chan->u[i] =
1116            mach->TempArray[index2D->i[i]][index->i[i]].xyzw[swizzle].u[i];
1117      }
1118      break;
1119
1120   case TGSI_FILE_IMMEDIATE:
1121      for (i = 0; i < QUAD_SIZE; i++) {
1122         assert(index->i[i] >= 0 && index->i[i] < (int)mach->ImmLimit);
1123         assert(index2D->i[i] == 0);
1124
1125         chan->f[i] = mach->Imms[index->i[i]][swizzle];
1126      }
1127      break;
1128
1129   case TGSI_FILE_IMMEDIATE_ARRAY:
1130      for (i = 0; i < QUAD_SIZE; i++) {
1131         assert(index2D->i[i] == 0);
1132
1133         chan->f[i] = mach->ImmArray[index->i[i]][swizzle];
1134      }
1135      break;
1136
1137   case TGSI_FILE_ADDRESS:
1138      for (i = 0; i < QUAD_SIZE; i++) {
1139         assert(index->i[i] >= 0);
1140         assert(index2D->i[i] == 0);
1141
1142         chan->u[i] = mach->Addrs[index->i[i]].xyzw[swizzle].u[i];
1143      }
1144      break;
1145
1146   case TGSI_FILE_PREDICATE:
1147      for (i = 0; i < QUAD_SIZE; i++) {
1148         assert(index->i[i] >= 0 && index->i[i] < TGSI_EXEC_NUM_PREDS);
1149         assert(index2D->i[i] == 0);
1150
1151         chan->u[i] = mach->Predicates[0].xyzw[swizzle].u[i];
1152      }
1153      break;
1154
1155   case TGSI_FILE_OUTPUT:
1156      /* vertex/fragment output vars can be read too */
1157      for (i = 0; i < QUAD_SIZE; i++) {
1158         assert(index->i[i] >= 0);
1159         assert(index2D->i[i] == 0);
1160
1161         chan->u[i] = mach->Outputs[index->i[i]].xyzw[swizzle].u[i];
1162      }
1163      break;
1164
1165   default:
1166      assert(0);
1167      for (i = 0; i < QUAD_SIZE; i++) {
1168         chan->u[i] = 0;
1169      }
1170   }
1171}
1172
1173static void
1174fetch_source(const struct tgsi_exec_machine *mach,
1175             union tgsi_exec_channel *chan,
1176             const struct tgsi_full_src_register *reg,
1177             const uint chan_index,
1178             enum tgsi_exec_datatype src_datatype)
1179{
1180   union tgsi_exec_channel index;
1181   union tgsi_exec_channel index2D;
1182   uint swizzle;
1183
1184   /* We start with a direct index into a register file.
1185    *
1186    *    file[1],
1187    *    where:
1188    *       file = Register.File
1189    *       [1] = Register.Index
1190    */
1191   index.i[0] =
1192   index.i[1] =
1193   index.i[2] =
1194   index.i[3] = reg->Register.Index;
1195
1196   /* There is an extra source register that indirectly subscripts
1197    * a register file. The direct index now becomes an offset
1198    * that is being added to the indirect register.
1199    *
1200    *    file[ind[2].x+1],
1201    *    where:
1202    *       ind = Indirect.File
1203    *       [2] = Indirect.Index
1204    *       .x = Indirect.SwizzleX
1205    */
1206   if (reg->Register.Indirect) {
1207      union tgsi_exec_channel index2;
1208      union tgsi_exec_channel indir_index;
1209      const uint execmask = mach->ExecMask;
1210      uint i;
1211
1212      /* which address register (always zero now) */
1213      index2.i[0] =
1214      index2.i[1] =
1215      index2.i[2] =
1216      index2.i[3] = reg->Indirect.Index;
1217      assert(reg->Indirect.File == TGSI_FILE_ADDRESS);
1218      /* get current value of address register[swizzle] */
1219      swizzle = tgsi_util_get_src_register_swizzle( &reg->Indirect, CHAN_X );
1220      fetch_src_file_channel(mach,
1221                             reg->Indirect.File,
1222                             swizzle,
1223                             &index2,
1224                             &ZeroVec,
1225                             &indir_index);
1226
1227      /* add value of address register to the offset */
1228      index.i[0] += indir_index.i[0];
1229      index.i[1] += indir_index.i[1];
1230      index.i[2] += indir_index.i[2];
1231      index.i[3] += indir_index.i[3];
1232
1233      /* for disabled execution channels, zero-out the index to
1234       * avoid using a potential garbage value.
1235       */
1236      for (i = 0; i < QUAD_SIZE; i++) {
1237         if ((execmask & (1 << i)) == 0)
1238            index.i[i] = 0;
1239      }
1240   }
1241
1242   /* There is an extra source register that is a second
1243    * subscript to a register file. Effectively it means that
1244    * the register file is actually a 2D array of registers.
1245    *
1246    *    file[3][1],
1247    *    where:
1248    *       [3] = Dimension.Index
1249    */
1250   if (reg->Register.Dimension) {
1251      index2D.i[0] =
1252      index2D.i[1] =
1253      index2D.i[2] =
1254      index2D.i[3] = reg->Dimension.Index;
1255
1256      /* Again, the second subscript index can be addressed indirectly
1257       * identically to the first one.
1258       * Nothing stops us from indirectly addressing the indirect register,
1259       * but there is no need for that, so we won't exercise it.
1260       *
1261       *    file[ind[4].y+3][1],
1262       *    where:
1263       *       ind = DimIndirect.File
1264       *       [4] = DimIndirect.Index
1265       *       .y = DimIndirect.SwizzleX
1266       */
1267      if (reg->Dimension.Indirect) {
1268         union tgsi_exec_channel index2;
1269         union tgsi_exec_channel indir_index;
1270         const uint execmask = mach->ExecMask;
1271         uint i;
1272
1273         index2.i[0] =
1274         index2.i[1] =
1275         index2.i[2] =
1276         index2.i[3] = reg->DimIndirect.Index;
1277
1278         swizzle = tgsi_util_get_src_register_swizzle( &reg->DimIndirect, CHAN_X );
1279         fetch_src_file_channel(mach,
1280                                reg->DimIndirect.File,
1281                                swizzle,
1282                                &index2,
1283                                &ZeroVec,
1284                                &indir_index);
1285
1286         index2D.i[0] += indir_index.i[0];
1287         index2D.i[1] += indir_index.i[1];
1288         index2D.i[2] += indir_index.i[2];
1289         index2D.i[3] += indir_index.i[3];
1290
1291         /* for disabled execution channels, zero-out the index to
1292          * avoid using a potential garbage value.
1293          */
1294         for (i = 0; i < QUAD_SIZE; i++) {
1295            if ((execmask & (1 << i)) == 0) {
1296               index2D.i[i] = 0;
1297            }
1298         }
1299      }
1300
1301      /* If by any chance there was a need for a 3D array of register
1302       * files, we would have to check whether Dimension is followed
1303       * by a dimension register and continue the saga.
1304       */
1305   } else {
1306      index2D.i[0] =
1307      index2D.i[1] =
1308      index2D.i[2] =
1309      index2D.i[3] = 0;
1310   }
1311
1312   swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index );
1313   fetch_src_file_channel(mach,
1314                          reg->Register.File,
1315                          swizzle,
1316                          &index,
1317                          &index2D,
1318                          chan);
1319
1320   if (reg->Register.Absolute) {
1321      if (src_datatype == TGSI_EXEC_DATA_FLOAT) {
1322         micro_abs(chan, chan);
1323      } else {
1324         micro_iabs(chan, chan);
1325      }
1326   }
1327
1328   if (reg->Register.Negate) {
1329      if (src_datatype == TGSI_EXEC_DATA_FLOAT) {
1330         micro_neg(chan, chan);
1331      } else {
1332         micro_ineg(chan, chan);
1333      }
1334   }
1335}
1336
1337static void
1338store_dest(struct tgsi_exec_machine *mach,
1339           const union tgsi_exec_channel *chan,
1340           const struct tgsi_full_dst_register *reg,
1341           const struct tgsi_full_instruction *inst,
1342           uint chan_index,
1343           enum tgsi_exec_datatype dst_datatype)
1344{
1345   uint i;
1346   union tgsi_exec_channel null;
1347   union tgsi_exec_channel *dst;
1348   union tgsi_exec_channel index2D;
1349   uint execmask = mach->ExecMask;
1350   int offset = 0;  /* indirection offset */
1351   int index;
1352
1353   /* for debugging */
1354   if (0 && dst_datatype == TGSI_EXEC_DATA_FLOAT) {
1355      check_inf_or_nan(chan);
1356   }
1357
1358   /* There is an extra source register that indirectly subscripts
1359    * a register file. The direct index now becomes an offset
1360    * that is being added to the indirect register.
1361    *
1362    *    file[ind[2].x+1],
1363    *    where:
1364    *       ind = Indirect.File
1365    *       [2] = Indirect.Index
1366    *       .x = Indirect.SwizzleX
1367    */
1368   if (reg->Register.Indirect) {
1369      union tgsi_exec_channel index;
1370      union tgsi_exec_channel indir_index;
1371      uint swizzle;
1372
1373      /* which address register (always zero for now) */
1374      index.i[0] =
1375      index.i[1] =
1376      index.i[2] =
1377      index.i[3] = reg->Indirect.Index;
1378
1379      /* get current value of address register[swizzle] */
1380      swizzle = tgsi_util_get_src_register_swizzle( &reg->Indirect, CHAN_X );
1381
1382      /* fetch values from the address/indirection register */
1383      fetch_src_file_channel(mach,
1384                             reg->Indirect.File,
1385                             swizzle,
1386                             &index,
1387                             &ZeroVec,
1388                             &indir_index);
1389
1390      /* save indirection offset */
1391      offset = indir_index.i[0];
1392   }
1393
1394   /* There is an extra source register that is a second
1395    * subscript to a register file. Effectively it means that
1396    * the register file is actually a 2D array of registers.
1397    *
1398    *    file[3][1],
1399    *    where:
1400    *       [3] = Dimension.Index
1401    */
1402   if (reg->Register.Dimension) {
1403      index2D.i[0] =
1404      index2D.i[1] =
1405      index2D.i[2] =
1406      index2D.i[3] = reg->Dimension.Index;
1407
1408      /* Again, the second subscript index can be addressed indirectly
1409       * identically to the first one.
1410       * Nothing stops us from indirectly addressing the indirect register,
1411       * but there is no need for that, so we won't exercise it.
1412       *
1413       *    file[ind[4].y+3][1],
1414       *    where:
1415       *       ind = DimIndirect.File
1416       *       [4] = DimIndirect.Index
1417       *       .y = DimIndirect.SwizzleX
1418       */
1419      if (reg->Dimension.Indirect) {
1420         union tgsi_exec_channel index2;
1421         union tgsi_exec_channel indir_index;
1422         const uint execmask = mach->ExecMask;
1423         unsigned swizzle;
1424         uint i;
1425
1426         index2.i[0] =
1427         index2.i[1] =
1428         index2.i[2] =
1429         index2.i[3] = reg->DimIndirect.Index;
1430
1431         swizzle = tgsi_util_get_src_register_swizzle( &reg->DimIndirect, CHAN_X );
1432         fetch_src_file_channel(mach,
1433                                reg->DimIndirect.File,
1434                                swizzle,
1435                                &index2,
1436                                &ZeroVec,
1437                                &indir_index);
1438
1439         index2D.i[0] += indir_index.i[0];
1440         index2D.i[1] += indir_index.i[1];
1441         index2D.i[2] += indir_index.i[2];
1442         index2D.i[3] += indir_index.i[3];
1443
1444         /* for disabled execution channels, zero-out the index to
1445          * avoid using a potential garbage value.
1446          */
1447         for (i = 0; i < QUAD_SIZE; i++) {
1448            if ((execmask & (1 << i)) == 0) {
1449               index2D.i[i] = 0;
1450            }
1451         }
1452      }
1453
1454      /* If by any chance there was a need for a 3D array of register
1455       * files, we would have to check whether Dimension is followed
1456       * by a dimension register and continue the saga.
1457       */
1458   } else {
1459      index2D.i[0] =
1460      index2D.i[1] =
1461      index2D.i[2] =
1462      index2D.i[3] = 0;
1463   }
1464
1465   switch (reg->Register.File) {
1466   case TGSI_FILE_NULL:
1467      dst = &null;
1468      break;
1469
1470   case TGSI_FILE_OUTPUT:
1471      index = mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0]
1472         + reg->Register.Index;
1473      dst = &mach->Outputs[offset + index].xyzw[chan_index];
1474#if 0
1475      if (TGSI_PROCESSOR_GEOMETRY == mach->Processor) {
1476         fprintf(stderr, "STORING OUT[%d] mask(%d), = (", offset + index, execmask);
1477         for (i = 0; i < QUAD_SIZE; i++)
1478            if (execmask & (1 << i))
1479               fprintf(stderr, "%f, ", chan->f[i]);
1480         fprintf(stderr, ")\n");
1481      }
1482#endif
1483      break;
1484
1485   case TGSI_FILE_TEMPORARY:
1486      index = reg->Register.Index;
1487      assert( index < TGSI_EXEC_NUM_TEMPS );
1488      dst = &mach->Temps[offset + index].xyzw[chan_index];
1489      break;
1490
1491   case TGSI_FILE_TEMPORARY_ARRAY:
1492      index = reg->Register.Index;
1493      assert( index < TGSI_EXEC_NUM_TEMPS );
1494      assert( index2D.i[0] < TGSI_EXEC_NUM_TEMP_ARRAYS );
1495      /* XXX we use index2D.i[0] here but somehow we might
1496       * end up with someone trying to store indirectly in
1497       * different buffers */
1498      dst = &mach->TempArray[index2D.i[0]][offset + index].xyzw[chan_index];
1499      break;
1500
1501   case TGSI_FILE_ADDRESS:
1502      index = reg->Register.Index;
1503      dst = &mach->Addrs[index].xyzw[chan_index];
1504      break;
1505
1506   case TGSI_FILE_PREDICATE:
1507      index = reg->Register.Index;
1508      assert(index < TGSI_EXEC_NUM_PREDS);
1509      dst = &mach->Predicates[index].xyzw[chan_index];
1510      break;
1511
1512   default:
1513      assert( 0 );
1514      return;
1515   }
1516
1517   if (inst->Instruction.Predicate) {
1518      uint swizzle;
1519      union tgsi_exec_channel *pred;
1520
1521      switch (chan_index) {
1522      case CHAN_X:
1523         swizzle = inst->Predicate.SwizzleX;
1524         break;
1525      case CHAN_Y:
1526         swizzle = inst->Predicate.SwizzleY;
1527         break;
1528      case CHAN_Z:
1529         swizzle = inst->Predicate.SwizzleZ;
1530         break;
1531      case CHAN_W:
1532         swizzle = inst->Predicate.SwizzleW;
1533         break;
1534      default:
1535         assert(0);
1536         return;
1537      }
1538
1539      assert(inst->Predicate.Index == 0);
1540
1541      pred = &mach->Predicates[inst->Predicate.Index].xyzw[swizzle];
1542
1543      if (inst->Predicate.Negate) {
1544         for (i = 0; i < QUAD_SIZE; i++) {
1545            if (pred->u[i]) {
1546               execmask &= ~(1 << i);
1547            }
1548         }
1549      } else {
1550         for (i = 0; i < QUAD_SIZE; i++) {
1551            if (!pred->u[i]) {
1552               execmask &= ~(1 << i);
1553            }
1554         }
1555      }
1556   }
1557
1558   switch (inst->Instruction.Saturate) {
1559   case TGSI_SAT_NONE:
1560      for (i = 0; i < QUAD_SIZE; i++)
1561         if (execmask & (1 << i))
1562            dst->i[i] = chan->i[i];
1563      break;
1564
1565   case TGSI_SAT_ZERO_ONE:
1566      for (i = 0; i < QUAD_SIZE; i++)
1567         if (execmask & (1 << i)) {
1568            if (chan->f[i] < 0.0f)
1569               dst->f[i] = 0.0f;
1570            else if (chan->f[i] > 1.0f)
1571               dst->f[i] = 1.0f;
1572            else
1573               dst->i[i] = chan->i[i];
1574         }
1575      break;
1576
1577   case TGSI_SAT_MINUS_PLUS_ONE:
1578      for (i = 0; i < QUAD_SIZE; i++)
1579         if (execmask & (1 << i)) {
1580            if (chan->f[i] < -1.0f)
1581               dst->f[i] = -1.0f;
1582            else if (chan->f[i] > 1.0f)
1583               dst->f[i] = 1.0f;
1584            else
1585               dst->i[i] = chan->i[i];
1586         }
1587      break;
1588
1589   default:
1590      assert( 0 );
1591   }
1592}
1593
/*
 * Fetch channel CHAN of source operand INDEX into VAL as float data.
 * Expects `mach` and `inst` to be in scope at the point of use.
 */
#define FETCH(VAL,INDEX,CHAN)\
    fetch_source(mach, (VAL), &inst->Src[(INDEX)], (CHAN), TGSI_EXEC_DATA_FLOAT)
1596
1597
1598/**
1599 * Execute ARB-style KIL which is predicated by a src register.
1600 * Kill fragment if any of the four values is less than zero.
1601 */
1602static void
1603exec_kil(struct tgsi_exec_machine *mach,
1604         const struct tgsi_full_instruction *inst)
1605{
1606   uint uniquemask;
1607   uint chan_index;
1608   uint kilmask = 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
1609   union tgsi_exec_channel r[1];
1610
1611   /* This mask stores component bits that were already tested. */
1612   uniquemask = 0;
1613
1614   for (chan_index = 0; chan_index < 4; chan_index++)
1615   {
1616      uint swizzle;
1617      uint i;
1618
1619      /* unswizzle channel */
1620      swizzle = tgsi_util_get_full_src_register_swizzle (
1621                        &inst->Src[0],
1622                        chan_index);
1623
1624      /* check if the component has not been already tested */
1625      if (uniquemask & (1 << swizzle))
1626         continue;
1627      uniquemask |= 1 << swizzle;
1628
1629      FETCH(&r[0], 0, chan_index);
1630      for (i = 0; i < 4; i++)
1631         if (r[0].f[i] < 0.0f)
1632            kilmask |= 1 << i;
1633   }
1634
1635   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
1636}
1637
1638/**
1639 * Execute NVIDIA-style KIL which is predicated by a condition code.
1640 * Kill fragment if the condition code is TRUE.
1641 */
1642static void
1643exec_kilp(struct tgsi_exec_machine *mach,
1644          const struct tgsi_full_instruction *inst)
1645{
1646   uint kilmask; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
1647
1648   /* "unconditional" kil */
1649   kilmask = mach->ExecMask;
1650   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
1651}
1652
1653static void
1654emit_vertex(struct tgsi_exec_machine *mach)
1655{
1656   /* FIXME: check for exec mask correctly
1657   unsigned i;
1658   for (i = 0; i < QUAD_SIZE; ++i) {
1659         if ((mach->ExecMask & (1 << i)))
1660   */
1661   if (mach->ExecMask) {
1662      mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] += mach->NumOutputs;
1663      mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]]++;
1664   }
1665}
1666
1667static void
1668emit_primitive(struct tgsi_exec_machine *mach)
1669{
1670   unsigned *prim_count = &mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0];
1671   /* FIXME: check for exec mask correctly
1672   unsigned i;
1673   for (i = 0; i < QUAD_SIZE; ++i) {
1674         if ((mach->ExecMask & (1 << i)))
1675   */
1676   if (mach->ExecMask) {
1677      ++(*prim_count);
1678      debug_assert((*prim_count * mach->NumOutputs) < mach->MaxGeometryShaderOutputs);
1679      mach->Primitives[*prim_count] = 0;
1680   }
1681}
1682
1683static void
1684conditional_emit_primitive(struct tgsi_exec_machine *mach)
1685{
1686   if (TGSI_PROCESSOR_GEOMETRY == mach->Processor) {
1687      int emitted_verts =
1688         mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]];
1689      if (emitted_verts) {
1690         emit_primitive(mach);
1691      }
1692   }
1693}
1694
1695
1696/*
1697 * Fetch four texture samples using STR texture coordinates.
1698 */
1699static void
1700fetch_texel( struct tgsi_sampler *sampler,
1701             const union tgsi_exec_channel *s,
1702             const union tgsi_exec_channel *t,
1703             const union tgsi_exec_channel *p,
1704             const union tgsi_exec_channel *c0,
1705             enum tgsi_sampler_control control,
1706             union tgsi_exec_channel *r,
1707             union tgsi_exec_channel *g,
1708             union tgsi_exec_channel *b,
1709             union tgsi_exec_channel *a )
1710{
1711   uint j;
1712   float rgba[NUM_CHANNELS][QUAD_SIZE];
1713
1714   sampler->get_samples(sampler, s->f, t->f, p->f, c0->f, control, rgba);
1715
1716   for (j = 0; j < 4; j++) {
1717      r->f[j] = rgba[0][j];
1718      g->f[j] = rgba[1][j];
1719      b->f[j] = rgba[2][j];
1720      a->f[j] = rgba[3][j];
1721   }
1722}
1723
1724
/* Values accepted by the `modifier` argument of exec_tex(). */
enum {
   TEX_MODIFIER_NONE         = 0,  /* source W is not fetched */
   TEX_MODIFIER_PROJECTED    = 1,  /* coordinates are divided by source W */
   TEX_MODIFIER_LOD_BIAS     = 2,  /* source W is passed as the LOD value, bias control */
   TEX_MODIFIER_EXPLICIT_LOD = 3   /* source W is passed as the LOD value, explicit control */
};
1729
1730
1731static void
1732exec_tex(struct tgsi_exec_machine *mach,
1733         const struct tgsi_full_instruction *inst,
1734         uint modifier)
1735{
1736   const uint unit = inst->Src[1].Register.Index;
1737   union tgsi_exec_channel r[4];
1738   const union tgsi_exec_channel *lod = &ZeroVec;
1739   enum tgsi_sampler_control control;
1740   uint chan;
1741
1742   if (modifier != TEX_MODIFIER_NONE) {
1743      FETCH(&r[3], 0, CHAN_W);
1744      if (modifier != TEX_MODIFIER_PROJECTED) {
1745         lod = &r[3];
1746      }
1747   }
1748
1749   if (modifier == TEX_MODIFIER_EXPLICIT_LOD) {
1750      control = tgsi_sampler_lod_explicit;
1751   } else {
1752      control = tgsi_sampler_lod_bias;
1753   }
1754
1755   switch (inst->Texture.Texture) {
1756   case TGSI_TEXTURE_1D:
1757   case TGSI_TEXTURE_SHADOW1D:
1758      FETCH(&r[0], 0, CHAN_X);
1759
1760      if (modifier == TEX_MODIFIER_PROJECTED) {
1761         micro_div(&r[0], &r[0], &r[3]);
1762      }
1763
1764      fetch_texel(mach->Samplers[unit],
1765                  &r[0], &ZeroVec, &ZeroVec, lod,  /* S, T, P, LOD */
1766                  control,
1767                  &r[0], &r[1], &r[2], &r[3]);     /* R, G, B, A */
1768      break;
1769
1770   case TGSI_TEXTURE_2D:
1771   case TGSI_TEXTURE_RECT:
1772   case TGSI_TEXTURE_SHADOW2D:
1773   case TGSI_TEXTURE_SHADOWRECT:
1774      FETCH(&r[0], 0, CHAN_X);
1775      FETCH(&r[1], 0, CHAN_Y);
1776      FETCH(&r[2], 0, CHAN_Z);
1777
1778      if (modifier == TEX_MODIFIER_PROJECTED) {
1779         micro_div(&r[0], &r[0], &r[3]);
1780         micro_div(&r[1], &r[1], &r[3]);
1781         micro_div(&r[2], &r[2], &r[3]);
1782      }
1783
1784      fetch_texel(mach->Samplers[unit],
1785                  &r[0], &r[1], &r[2], lod,     /* S, T, P, LOD */
1786                  control,
1787                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
1788      break;
1789
1790   case TGSI_TEXTURE_1D_ARRAY:
1791      FETCH(&r[0], 0, CHAN_X);
1792      FETCH(&r[1], 0, CHAN_Y);
1793
1794      if (modifier == TEX_MODIFIER_PROJECTED) {
1795         micro_div(&r[0], &r[0], &r[3]);
1796      }
1797
1798      fetch_texel(mach->Samplers[unit],
1799                  &r[0], &r[1], &r[2], lod,     /* S, T, P, LOD */
1800                  control,
1801                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
1802      break;
1803
1804   case TGSI_TEXTURE_2D_ARRAY:
1805      FETCH(&r[0], 0, CHAN_X);
1806      FETCH(&r[1], 0, CHAN_Y);
1807      FETCH(&r[2], 0, CHAN_Z);
1808
1809      if (modifier == TEX_MODIFIER_PROJECTED) {
1810         micro_div(&r[0], &r[0], &r[3]);
1811         micro_div(&r[1], &r[1], &r[3]);
1812      }
1813
1814      fetch_texel(mach->Samplers[unit],
1815                  &r[0], &r[1], &r[2], lod,     /* S, T, P, LOD */
1816                  control,
1817                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
1818      break;
1819
1820   case TGSI_TEXTURE_3D:
1821   case TGSI_TEXTURE_CUBE:
1822      FETCH(&r[0], 0, CHAN_X);
1823      FETCH(&r[1], 0, CHAN_Y);
1824      FETCH(&r[2], 0, CHAN_Z);
1825
1826      if (modifier == TEX_MODIFIER_PROJECTED) {
1827         micro_div(&r[0], &r[0], &r[3]);
1828         micro_div(&r[1], &r[1], &r[3]);
1829         micro_div(&r[2], &r[2], &r[3]);
1830      }
1831
1832      fetch_texel(mach->Samplers[unit],
1833                  &r[0], &r[1], &r[2], lod,
1834                  control,
1835                  &r[0], &r[1], &r[2], &r[3]);
1836      break;
1837
1838   default:
1839      assert(0);
1840   }
1841
1842   for (chan = 0; chan < NUM_CHANNELS; chan++) {
1843      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1844         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
1845      }
1846   }
1847}
1848
1849static void
1850exec_txd(struct tgsi_exec_machine *mach,
1851         const struct tgsi_full_instruction *inst)
1852{
1853   const uint unit = inst->Src[3].Register.Index;
1854   union tgsi_exec_channel r[4];
1855   uint chan;
1856
1857   /*
1858    * XXX: This is fake TXD -- the derivatives are not taken into account, yet.
1859    */
1860
1861   switch (inst->Texture.Texture) {
1862   case TGSI_TEXTURE_1D:
1863   case TGSI_TEXTURE_SHADOW1D:
1864
1865      FETCH(&r[0], 0, CHAN_X);
1866
1867      fetch_texel(mach->Samplers[unit],
1868                  &r[0], &ZeroVec, &ZeroVec, &ZeroVec,   /* S, T, P, BIAS */
1869                  tgsi_sampler_lod_bias,
1870                  &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
1871      break;
1872
1873   case TGSI_TEXTURE_2D:
1874   case TGSI_TEXTURE_RECT:
1875   case TGSI_TEXTURE_SHADOW2D:
1876   case TGSI_TEXTURE_SHADOWRECT:
1877
1878      FETCH(&r[0], 0, CHAN_X);
1879      FETCH(&r[1], 0, CHAN_Y);
1880      FETCH(&r[2], 0, CHAN_Z);
1881
1882      fetch_texel(mach->Samplers[unit],
1883                  &r[0], &r[1], &r[2], &ZeroVec,   /* inputs */
1884                  tgsi_sampler_lod_bias,
1885                  &r[0], &r[1], &r[2], &r[3]);     /* outputs */
1886      break;
1887
1888   case TGSI_TEXTURE_3D:
1889   case TGSI_TEXTURE_CUBE:
1890
1891      FETCH(&r[0], 0, CHAN_X);
1892      FETCH(&r[1], 0, CHAN_Y);
1893      FETCH(&r[2], 0, CHAN_Z);
1894
1895      fetch_texel(mach->Samplers[unit],
1896                  &r[0], &r[1], &r[2], &ZeroVec,
1897                  tgsi_sampler_lod_bias,
1898                  &r[0], &r[1], &r[2], &r[3]);
1899      break;
1900
1901   default:
1902      assert(0);
1903   }
1904
1905   for (chan = 0; chan < NUM_CHANNELS; chan++) {
1906      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1907         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
1908      }
1909   }
1910}
1911
1912
1913
1914static void
1915exec_sample(struct tgsi_exec_machine *mach,
1916            const struct tgsi_full_instruction *inst,
1917            uint modifier)
1918{
1919   const uint resource_unit = inst->Src[1].Register.Index;
1920   const uint sampler_unit = inst->Src[2].Register.Index;
1921   union tgsi_exec_channel r[4];
1922   const union tgsi_exec_channel *lod = &ZeroVec;
1923   enum tgsi_sampler_control control;
1924   uint chan;
1925
1926   if (modifier != TEX_MODIFIER_NONE) {
1927      if (modifier == TEX_MODIFIER_LOD_BIAS)
1928         FETCH(&r[3], 3, CHAN_X);
1929      else /*TEX_MODIFIER_LOD*/
1930         FETCH(&r[3], 0, CHAN_W);
1931
1932      if (modifier != TEX_MODIFIER_PROJECTED) {
1933         lod = &r[3];
1934      }
1935   }
1936
1937   if (modifier == TEX_MODIFIER_EXPLICIT_LOD) {
1938      control = tgsi_sampler_lod_explicit;
1939   } else {
1940      control = tgsi_sampler_lod_bias;
1941   }
1942
1943   switch (mach->Resources[resource_unit].Resource) {
1944   case TGSI_TEXTURE_1D:
1945   case TGSI_TEXTURE_SHADOW1D:
1946      FETCH(&r[0], 0, CHAN_X);
1947
1948      if (modifier == TEX_MODIFIER_PROJECTED) {
1949         micro_div(&r[0], &r[0], &r[3]);
1950      }
1951
1952      fetch_texel(mach->Samplers[sampler_unit],
1953                  &r[0], &ZeroVec, &ZeroVec, lod,  /* S, T, P, LOD */
1954                  control,
1955                  &r[0], &r[1], &r[2], &r[3]);     /* R, G, B, A */
1956      break;
1957
1958   case TGSI_TEXTURE_2D:
1959   case TGSI_TEXTURE_RECT:
1960   case TGSI_TEXTURE_SHADOW2D:
1961   case TGSI_TEXTURE_SHADOWRECT:
1962      FETCH(&r[0], 0, CHAN_X);
1963      FETCH(&r[1], 0, CHAN_Y);
1964      FETCH(&r[2], 0, CHAN_Z);
1965
1966      if (modifier == TEX_MODIFIER_PROJECTED) {
1967         micro_div(&r[0], &r[0], &r[3]);
1968         micro_div(&r[1], &r[1], &r[3]);
1969         micro_div(&r[2], &r[2], &r[3]);
1970      }
1971
1972      fetch_texel(mach->Samplers[sampler_unit],
1973                  &r[0], &r[1], &r[2], lod,     /* S, T, P, LOD */
1974                  control,
1975                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
1976      break;
1977
1978   case TGSI_TEXTURE_3D:
1979   case TGSI_TEXTURE_CUBE:
1980      FETCH(&r[0], 0, CHAN_X);
1981      FETCH(&r[1], 0, CHAN_Y);
1982      FETCH(&r[2], 0, CHAN_Z);
1983
1984      if (modifier == TEX_MODIFIER_PROJECTED) {
1985         micro_div(&r[0], &r[0], &r[3]);
1986         micro_div(&r[1], &r[1], &r[3]);
1987         micro_div(&r[2], &r[2], &r[3]);
1988      }
1989
1990      fetch_texel(mach->Samplers[sampler_unit],
1991                  &r[0], &r[1], &r[2], lod,
1992                  control,
1993                  &r[0], &r[1], &r[2], &r[3]);
1994      break;
1995
1996   default:
1997      assert(0);
1998   }
1999
2000   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2001      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2002         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2003      }
2004   }
2005}
2006
2007static void
2008exec_sample_d(struct tgsi_exec_machine *mach,
2009              const struct tgsi_full_instruction *inst)
2010{
2011   const uint resource_unit = inst->Src[1].Register.Index;
2012   const uint sampler_unit = inst->Src[2].Register.Index;
2013   union tgsi_exec_channel r[4];
2014   uint chan;
2015   /*
2016    * XXX: This is fake SAMPLE_D -- the derivatives are not taken into account, yet.
2017    */
2018
2019   switch (mach->Resources[resource_unit].Resource) {
2020   case TGSI_TEXTURE_1D:
2021   case TGSI_TEXTURE_SHADOW1D:
2022
2023      FETCH(&r[0], 0, CHAN_X);
2024
2025      fetch_texel(mach->Samplers[sampler_unit],
2026                  &r[0], &ZeroVec, &ZeroVec, &ZeroVec,   /* S, T, P, BIAS */
2027                  tgsi_sampler_lod_bias,
2028                  &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
2029      break;
2030
2031   case TGSI_TEXTURE_2D:
2032   case TGSI_TEXTURE_RECT:
2033   case TGSI_TEXTURE_SHADOW2D:
2034   case TGSI_TEXTURE_SHADOWRECT:
2035
2036      FETCH(&r[0], 0, CHAN_X);
2037      FETCH(&r[1], 0, CHAN_Y);
2038      FETCH(&r[2], 0, CHAN_Z);
2039
2040      fetch_texel(mach->Samplers[sampler_unit],
2041                  &r[0], &r[1], &r[2], &ZeroVec,   /* inputs */
2042                  tgsi_sampler_lod_bias,
2043                  &r[0], &r[1], &r[2], &r[3]);     /* outputs */
2044      break;
2045
2046   case TGSI_TEXTURE_3D:
2047   case TGSI_TEXTURE_CUBE:
2048
2049      FETCH(&r[0], 0, CHAN_X);
2050      FETCH(&r[1], 0, CHAN_Y);
2051      FETCH(&r[2], 0, CHAN_Z);
2052
2053      fetch_texel(mach->Samplers[sampler_unit],
2054                  &r[0], &r[1], &r[2], &ZeroVec,
2055                  tgsi_sampler_lod_bias,
2056                  &r[0], &r[1], &r[2], &r[3]);
2057      break;
2058
2059   default:
2060      assert(0);
2061   }
2062
2063   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2064      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2065         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2066      }
2067   }
2068}
2069
2070
2071/**
2072 * Evaluate a constant-valued coefficient at the position of the
2073 * current quad.
2074 */
2075static void
2076eval_constant_coef(
2077   struct tgsi_exec_machine *mach,
2078   unsigned attrib,
2079   unsigned chan )
2080{
2081   unsigned i;
2082
2083   for( i = 0; i < QUAD_SIZE; i++ ) {
2084      mach->Inputs[attrib].xyzw[chan].f[i] = mach->InterpCoefs[attrib].a0[chan];
2085   }
2086}
2087
2088/**
2089 * Evaluate a linear-valued coefficient at the position of the
2090 * current quad.
2091 */
2092static void
2093eval_linear_coef(
2094   struct tgsi_exec_machine *mach,
2095   unsigned attrib,
2096   unsigned chan )
2097{
2098   const float x = mach->QuadPos.xyzw[0].f[0];
2099   const float y = mach->QuadPos.xyzw[1].f[0];
2100   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
2101   const float dady = mach->InterpCoefs[attrib].dady[chan];
2102   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
2103   mach->Inputs[attrib].xyzw[chan].f[0] = a0;
2104   mach->Inputs[attrib].xyzw[chan].f[1] = a0 + dadx;
2105   mach->Inputs[attrib].xyzw[chan].f[2] = a0 + dady;
2106   mach->Inputs[attrib].xyzw[chan].f[3] = a0 + dadx + dady;
2107}
2108
2109/**
2110 * Evaluate a perspective-valued coefficient at the position of the
2111 * current quad.
2112 */
2113static void
2114eval_perspective_coef(
2115   struct tgsi_exec_machine *mach,
2116   unsigned attrib,
2117   unsigned chan )
2118{
2119   const float x = mach->QuadPos.xyzw[0].f[0];
2120   const float y = mach->QuadPos.xyzw[1].f[0];
2121   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
2122   const float dady = mach->InterpCoefs[attrib].dady[chan];
2123   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
2124   const float *w = mach->QuadPos.xyzw[3].f;
2125   /* divide by W here */
2126   mach->Inputs[attrib].xyzw[chan].f[0] = a0 / w[0];
2127   mach->Inputs[attrib].xyzw[chan].f[1] = (a0 + dadx) / w[1];
2128   mach->Inputs[attrib].xyzw[chan].f[2] = (a0 + dady) / w[2];
2129   mach->Inputs[attrib].xyzw[chan].f[3] = (a0 + dadx + dady) / w[3];
2130}
2131
2132
2133typedef void (* eval_coef_func)(
2134   struct tgsi_exec_machine *mach,
2135   unsigned attrib,
2136   unsigned chan );
2137
2138static void
2139exec_declaration(struct tgsi_exec_machine *mach,
2140                 const struct tgsi_full_declaration *decl)
2141{
2142   if (decl->Declaration.File == TGSI_FILE_RESOURCE) {
2143      mach->Resources[decl->Range.First] = decl->Resource;
2144      return;
2145   }
2146
2147   if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) {
2148      if (decl->Declaration.File == TGSI_FILE_INPUT) {
2149         uint first, last, mask;
2150
2151         first = decl->Range.First;
2152         last = decl->Range.Last;
2153         mask = decl->Declaration.UsageMask;
2154
2155         /* XXX we could remove this special-case code since
2156          * mach->InterpCoefs[first].a0 should already have the
2157          * front/back-face value.  But we should first update the
2158          * ureg code to emit the right UsageMask value (WRITEMASK_X).
2159          * Then, we could remove the tgsi_exec_machine::Face field.
2160          */
2161         /* XXX make FACE a system value */
2162         if (decl->Semantic.Name == TGSI_SEMANTIC_FACE) {
2163            uint i;
2164
2165            assert(decl->Semantic.Index == 0);
2166            assert(first == last);
2167
2168            for (i = 0; i < QUAD_SIZE; i++) {
2169               mach->Inputs[first].xyzw[0].f[i] = mach->Face;
2170            }
2171         } else {
2172            eval_coef_func eval;
2173            uint i, j;
2174
2175            switch (decl->Declaration.Interpolate) {
2176            case TGSI_INTERPOLATE_CONSTANT:
2177               eval = eval_constant_coef;
2178               break;
2179
2180            case TGSI_INTERPOLATE_LINEAR:
2181               eval = eval_linear_coef;
2182               break;
2183
2184            case TGSI_INTERPOLATE_PERSPECTIVE:
2185               eval = eval_perspective_coef;
2186               break;
2187
2188            default:
2189               assert(0);
2190               return;
2191            }
2192
2193            for (j = 0; j < NUM_CHANNELS; j++) {
2194               if (mask & (1 << j)) {
2195                  for (i = first; i <= last; i++) {
2196                     eval(mach, i, j);
2197                  }
2198               }
2199            }
2200         }
2201      }
2202   }
2203
2204   if (decl->Declaration.File == TGSI_FILE_SYSTEM_VALUE) {
2205      mach->SysSemanticToIndex[decl->Declaration.Semantic] = decl->Range.First;
2206   }
2207}
2208
2209
2210typedef void (* micro_op)(union tgsi_exec_channel *dst);
2211
2212static void
2213exec_vector(struct tgsi_exec_machine *mach,
2214            const struct tgsi_full_instruction *inst,
2215            micro_op op,
2216            enum tgsi_exec_datatype dst_datatype)
2217{
2218   unsigned int chan;
2219
2220   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2221      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2222         union tgsi_exec_channel dst;
2223
2224         op(&dst);
2225         store_dest(mach, &dst, &inst->Dst[0], inst, chan, dst_datatype);
2226      }
2227   }
2228}
2229
2230typedef void (* micro_unary_op)(union tgsi_exec_channel *dst,
2231                                const union tgsi_exec_channel *src);
2232
2233static void
2234exec_scalar_unary(struct tgsi_exec_machine *mach,
2235                  const struct tgsi_full_instruction *inst,
2236                  micro_unary_op op,
2237                  enum tgsi_exec_datatype dst_datatype,
2238                  enum tgsi_exec_datatype src_datatype)
2239{
2240   unsigned int chan;
2241   union tgsi_exec_channel src;
2242   union tgsi_exec_channel dst;
2243
2244   fetch_source(mach, &src, &inst->Src[0], CHAN_X, src_datatype);
2245   op(&dst, &src);
2246   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2247      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2248         store_dest(mach, &dst, &inst->Dst[0], inst, chan, dst_datatype);
2249      }
2250   }
2251}
2252
2253static void
2254exec_vector_unary(struct tgsi_exec_machine *mach,
2255                  const struct tgsi_full_instruction *inst,
2256                  micro_unary_op op,
2257                  enum tgsi_exec_datatype dst_datatype,
2258                  enum tgsi_exec_datatype src_datatype)
2259{
2260   unsigned int chan;
2261   struct tgsi_exec_vector dst;
2262
2263   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2264      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2265         union tgsi_exec_channel src;
2266
2267         fetch_source(mach, &src, &inst->Src[0], chan, src_datatype);
2268         op(&dst.xyzw[chan], &src);
2269      }
2270   }
2271   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2272      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2273         store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
2274      }
2275   }
2276}
2277
2278typedef void (* micro_binary_op)(union tgsi_exec_channel *dst,
2279                                 const union tgsi_exec_channel *src0,
2280                                 const union tgsi_exec_channel *src1);
2281
2282static void
2283exec_scalar_binary(struct tgsi_exec_machine *mach,
2284                   const struct tgsi_full_instruction *inst,
2285                   micro_binary_op op,
2286                   enum tgsi_exec_datatype dst_datatype,
2287                   enum tgsi_exec_datatype src_datatype)
2288{
2289   unsigned int chan;
2290   union tgsi_exec_channel src[2];
2291   union tgsi_exec_channel dst;
2292
2293   fetch_source(mach, &src[0], &inst->Src[0], CHAN_X, src_datatype);
2294   fetch_source(mach, &src[1], &inst->Src[1], CHAN_Y, src_datatype);
2295   op(&dst, &src[0], &src[1]);
2296   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2297      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2298         store_dest(mach, &dst, &inst->Dst[0], inst, chan, dst_datatype);
2299      }
2300   }
2301}
2302
2303static void
2304exec_vector_binary(struct tgsi_exec_machine *mach,
2305                   const struct tgsi_full_instruction *inst,
2306                   micro_binary_op op,
2307                   enum tgsi_exec_datatype dst_datatype,
2308                   enum tgsi_exec_datatype src_datatype)
2309{
2310   unsigned int chan;
2311   struct tgsi_exec_vector dst;
2312
2313   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2314      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2315         union tgsi_exec_channel src[2];
2316
2317         fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
2318         fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
2319         op(&dst.xyzw[chan], &src[0], &src[1]);
2320      }
2321   }
2322   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2323      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2324         store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
2325      }
2326   }
2327}
2328
2329typedef void (* micro_trinary_op)(union tgsi_exec_channel *dst,
2330                                  const union tgsi_exec_channel *src0,
2331                                  const union tgsi_exec_channel *src1,
2332                                  const union tgsi_exec_channel *src2);
2333
2334static void
2335exec_vector_trinary(struct tgsi_exec_machine *mach,
2336                    const struct tgsi_full_instruction *inst,
2337                    micro_trinary_op op,
2338                    enum tgsi_exec_datatype dst_datatype,
2339                    enum tgsi_exec_datatype src_datatype)
2340{
2341   unsigned int chan;
2342   struct tgsi_exec_vector dst;
2343
2344   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2345      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2346         union tgsi_exec_channel src[3];
2347
2348         fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
2349         fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
2350         fetch_source(mach, &src[2], &inst->Src[2], chan, src_datatype);
2351         op(&dst.xyzw[chan], &src[0], &src[1], &src[2]);
2352      }
2353   }
2354   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2355      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2356         store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
2357      }
2358   }
2359}
2360
2361static void
2362exec_dp3(struct tgsi_exec_machine *mach,
2363         const struct tgsi_full_instruction *inst)
2364{
2365   unsigned int chan;
2366   union tgsi_exec_channel arg[3];
2367
2368   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2369   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2370   micro_mul(&arg[2], &arg[0], &arg[1]);
2371
2372   for (chan = CHAN_Y; chan <= CHAN_Z; chan++) {
2373      fetch_source(mach, &arg[0], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
2374      fetch_source(mach, &arg[1], &inst->Src[1], chan, TGSI_EXEC_DATA_FLOAT);
2375      micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
2376   }
2377
2378   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2379      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2380         store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2381      }
2382   }
2383}
2384
2385static void
2386exec_dp4(struct tgsi_exec_machine *mach,
2387         const struct tgsi_full_instruction *inst)
2388{
2389   unsigned int chan;
2390   union tgsi_exec_channel arg[3];
2391
2392   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2393   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2394   micro_mul(&arg[2], &arg[0], &arg[1]);
2395
2396   for (chan = CHAN_Y; chan <= CHAN_W; chan++) {
2397      fetch_source(mach, &arg[0], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
2398      fetch_source(mach, &arg[1], &inst->Src[1], chan, TGSI_EXEC_DATA_FLOAT);
2399      micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
2400   }
2401
2402   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2403      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2404         store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2405      }
2406   }
2407}
2408
2409static void
2410exec_dp2a(struct tgsi_exec_machine *mach,
2411          const struct tgsi_full_instruction *inst)
2412{
2413   unsigned int chan;
2414   union tgsi_exec_channel arg[3];
2415
2416   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2417   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2418   micro_mul(&arg[2], &arg[0], &arg[1]);
2419
2420   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2421   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2422   micro_mad(&arg[0], &arg[0], &arg[1], &arg[2]);
2423
2424   fetch_source(mach, &arg[1], &inst->Src[2], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2425   micro_add(&arg[0], &arg[0], &arg[1]);
2426
2427   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2428      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2429         store_dest(mach, &arg[0], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2430      }
2431   }
2432}
2433
2434static void
2435exec_dph(struct tgsi_exec_machine *mach,
2436         const struct tgsi_full_instruction *inst)
2437{
2438   unsigned int chan;
2439   union tgsi_exec_channel arg[3];
2440
2441   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2442   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2443   micro_mul(&arg[2], &arg[0], &arg[1]);
2444
2445   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2446   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2447   micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
2448
2449   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2450   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2451   micro_mad(&arg[0], &arg[0], &arg[1], &arg[2]);
2452
2453   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_W, TGSI_EXEC_DATA_FLOAT);
2454   micro_add(&arg[0], &arg[0], &arg[1]);
2455
2456   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2457      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2458         store_dest(mach, &arg[0], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2459      }
2460   }
2461}
2462
2463static void
2464exec_dp2(struct tgsi_exec_machine *mach,
2465         const struct tgsi_full_instruction *inst)
2466{
2467   unsigned int chan;
2468   union tgsi_exec_channel arg[3];
2469
2470   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2471   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2472   micro_mul(&arg[2], &arg[0], &arg[1]);
2473
2474   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2475   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2476   micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
2477
2478   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2479      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2480         store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2481      }
2482   }
2483}
2484
2485static void
2486exec_nrm4(struct tgsi_exec_machine *mach,
2487          const struct tgsi_full_instruction *inst)
2488{
2489   unsigned int chan;
2490   union tgsi_exec_channel arg[4];
2491   union tgsi_exec_channel scale;
2492
2493   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2494   micro_mul(&scale, &arg[0], &arg[0]);
2495
2496   for (chan = CHAN_Y; chan <= CHAN_W; chan++) {
2497      union tgsi_exec_channel product;
2498
2499      fetch_source(mach, &arg[chan], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
2500      micro_mul(&product, &arg[chan], &arg[chan]);
2501      micro_add(&scale, &scale, &product);
2502   }
2503
2504   micro_rsq(&scale, &scale);
2505
2506   for (chan = CHAN_X; chan <= CHAN_W; chan++) {
2507      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2508         micro_mul(&arg[chan], &arg[chan], &scale);
2509         store_dest(mach, &arg[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2510      }
2511   }
2512}
2513
2514static void
2515exec_nrm3(struct tgsi_exec_machine *mach,
2516          const struct tgsi_full_instruction *inst)
2517{
2518   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XYZ) {
2519      unsigned int chan;
2520      union tgsi_exec_channel arg[3];
2521      union tgsi_exec_channel scale;
2522
2523      fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2524      micro_mul(&scale, &arg[0], &arg[0]);
2525
2526      for (chan = CHAN_Y; chan <= CHAN_Z; chan++) {
2527         union tgsi_exec_channel product;
2528
2529         fetch_source(mach, &arg[chan], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
2530         micro_mul(&product, &arg[chan], &arg[chan]);
2531         micro_add(&scale, &scale, &product);
2532      }
2533
2534      micro_rsq(&scale, &scale);
2535
2536      for (chan = CHAN_X; chan <= CHAN_Z; chan++) {
2537         if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2538            micro_mul(&arg[chan], &arg[chan], &scale);
2539            store_dest(mach, &arg[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2540         }
2541      }
2542   }
2543
2544   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2545      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2546   }
2547}
2548
2549static void
2550exec_scs(struct tgsi_exec_machine *mach,
2551         const struct tgsi_full_instruction *inst)
2552{
2553   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY) {
2554      union tgsi_exec_channel arg;
2555      union tgsi_exec_channel result;
2556
2557      fetch_source(mach, &arg, &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2558
2559      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2560         micro_cos(&result, &arg);
2561         store_dest(mach, &result, &inst->Dst[0], inst, CHAN_X, TGSI_EXEC_DATA_FLOAT);
2562      }
2563      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2564         micro_sin(&result, &arg);
2565         store_dest(mach, &result, &inst->Dst[0], inst, CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2566      }
2567   }
2568   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2569      store_dest(mach, &ZeroVec, &inst->Dst[0], inst, CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2570   }
2571   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2572      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2573   }
2574}
2575
2576static void
2577exec_x2d(struct tgsi_exec_machine *mach,
2578         const struct tgsi_full_instruction *inst)
2579{
2580   union tgsi_exec_channel r[4];
2581   union tgsi_exec_channel d[2];
2582
2583   fetch_source(mach, &r[0], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2584   fetch_source(mach, &r[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2585   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XZ) {
2586      fetch_source(mach, &r[2], &inst->Src[2], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2587      micro_mul(&r[2], &r[2], &r[0]);
2588      fetch_source(mach, &r[3], &inst->Src[2], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2589      micro_mul(&r[3], &r[3], &r[1]);
2590      micro_add(&r[2], &r[2], &r[3]);
2591      fetch_source(mach, &r[3], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2592      micro_add(&d[0], &r[2], &r[3]);
2593   }
2594   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_YW) {
2595      fetch_source(mach, &r[2], &inst->Src[2], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2596      micro_mul(&r[2], &r[2], &r[0]);
2597      fetch_source(mach, &r[3], &inst->Src[2], CHAN_W, TGSI_EXEC_DATA_FLOAT);
2598      micro_mul(&r[3], &r[3], &r[1]);
2599      micro_add(&r[2], &r[2], &r[3]);
2600      fetch_source(mach, &r[3], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2601      micro_add(&d[1], &r[2], &r[3]);
2602   }
2603   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2604      store_dest(mach, &d[0], &inst->Dst[0], inst, CHAN_X, TGSI_EXEC_DATA_FLOAT);
2605   }
2606   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2607      store_dest(mach, &d[1], &inst->Dst[0], inst, CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2608   }
2609   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2610      store_dest(mach, &d[0], &inst->Dst[0], inst, CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2611   }
2612   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2613      store_dest(mach, &d[1], &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2614   }
2615}
2616
2617static void
2618exec_rfl(struct tgsi_exec_machine *mach,
2619         const struct tgsi_full_instruction *inst)
2620{
2621   union tgsi_exec_channel r[9];
2622
2623   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XYZ) {
2624      /* r0 = dp3(src0, src0) */
2625      fetch_source(mach, &r[2], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2626      micro_mul(&r[0], &r[2], &r[2]);
2627      fetch_source(mach, &r[4], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2628      micro_mul(&r[8], &r[4], &r[4]);
2629      micro_add(&r[0], &r[0], &r[8]);
2630      fetch_source(mach, &r[6], &inst->Src[0], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2631      micro_mul(&r[8], &r[6], &r[6]);
2632      micro_add(&r[0], &r[0], &r[8]);
2633
2634      /* r1 = dp3(src0, src1) */
2635      fetch_source(mach, &r[3], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2636      micro_mul(&r[1], &r[2], &r[3]);
2637      fetch_source(mach, &r[5], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2638      micro_mul(&r[8], &r[4], &r[5]);
2639      micro_add(&r[1], &r[1], &r[8]);
2640      fetch_source(mach, &r[7], &inst->Src[1], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2641      micro_mul(&r[8], &r[6], &r[7]);
2642      micro_add(&r[1], &r[1], &r[8]);
2643
2644      /* r1 = 2 * r1 / r0 */
2645      micro_add(&r[1], &r[1], &r[1]);
2646      micro_div(&r[1], &r[1], &r[0]);
2647
2648      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2649         micro_mul(&r[2], &r[2], &r[1]);
2650         micro_sub(&r[2], &r[2], &r[3]);
2651         store_dest(mach, &r[2], &inst->Dst[0], inst, CHAN_X, TGSI_EXEC_DATA_FLOAT);
2652      }
2653      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2654         micro_mul(&r[4], &r[4], &r[1]);
2655         micro_sub(&r[4], &r[4], &r[5]);
2656         store_dest(mach, &r[4], &inst->Dst[0], inst, CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2657      }
2658      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2659         micro_mul(&r[6], &r[6], &r[1]);
2660         micro_sub(&r[6], &r[6], &r[7]);
2661         store_dest(mach, &r[6], &inst->Dst[0], inst, CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2662      }
2663   }
2664   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2665      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2666   }
2667}
2668
2669static void
2670exec_xpd(struct tgsi_exec_machine *mach,
2671         const struct tgsi_full_instruction *inst)
2672{
2673   union tgsi_exec_channel r[6];
2674   union tgsi_exec_channel d[3];
2675
2676   fetch_source(mach, &r[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2677   fetch_source(mach, &r[1], &inst->Src[1], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2678
2679   micro_mul(&r[2], &r[0], &r[1]);
2680
2681   fetch_source(mach, &r[3], &inst->Src[0], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2682   fetch_source(mach, &r[4], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2683
2684   micro_mul(&r[5], &r[3], &r[4] );
2685   micro_sub(&d[CHAN_X], &r[2], &r[5]);
2686
2687   fetch_source(mach, &r[2], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2688
2689   micro_mul(&r[3], &r[3], &r[2]);
2690
2691   fetch_source(mach, &r[5], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2692
2693   micro_mul(&r[1], &r[1], &r[5]);
2694   micro_sub(&d[CHAN_Y], &r[3], &r[1]);
2695
2696   micro_mul(&r[5], &r[5], &r[4]);
2697   micro_mul(&r[0], &r[0], &r[2]);
2698   micro_sub(&d[CHAN_Z], &r[5], &r[0]);
2699
2700   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2701      store_dest(mach, &d[CHAN_X], &inst->Dst[0], inst, CHAN_X, TGSI_EXEC_DATA_FLOAT);
2702   }
2703   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2704      store_dest(mach, &d[CHAN_Y], &inst->Dst[0], inst, CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2705   }
2706   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2707      store_dest(mach, &d[CHAN_Z], &inst->Dst[0], inst, CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2708   }
2709   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2710      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2711   }
2712}
2713
2714static void
2715exec_dst(struct tgsi_exec_machine *mach,
2716         const struct tgsi_full_instruction *inst)
2717{
2718   union tgsi_exec_channel r[2];
2719   union tgsi_exec_channel d[4];
2720
2721   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2722      fetch_source(mach, &r[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2723      fetch_source(mach, &r[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2724      micro_mul(&d[CHAN_Y], &r[0], &r[1]);
2725   }
2726   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2727      fetch_source(mach, &d[CHAN_Z], &inst->Src[0], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2728   }
2729   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2730      fetch_source(mach, &d[CHAN_W], &inst->Src[1], CHAN_W, TGSI_EXEC_DATA_FLOAT);
2731   }
2732
2733   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2734      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_X, TGSI_EXEC_DATA_FLOAT);
2735   }
2736   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2737      store_dest(mach, &d[CHAN_Y], &inst->Dst[0], inst, CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2738   }
2739   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2740      store_dest(mach, &d[CHAN_Z], &inst->Dst[0], inst, CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2741   }
2742   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2743      store_dest(mach, &d[CHAN_W], &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2744   }
2745}
2746
2747static void
2748exec_log(struct tgsi_exec_machine *mach,
2749         const struct tgsi_full_instruction *inst)
2750{
2751   union tgsi_exec_channel r[3];
2752
2753   fetch_source(mach, &r[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2754   micro_abs(&r[2], &r[0]);  /* r2 = abs(r0) */
2755   micro_lg2(&r[1], &r[2]);  /* r1 = lg2(r2) */
2756   micro_flr(&r[0], &r[1]);  /* r0 = floor(r1) */
2757   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2758      store_dest(mach, &r[0], &inst->Dst[0], inst, CHAN_X, TGSI_EXEC_DATA_FLOAT);
2759   }
2760   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2761      micro_exp2(&r[0], &r[0]);       /* r0 = 2 ^ r0 */
2762      micro_div(&r[0], &r[2], &r[0]); /* r0 = r2 / r0 */
2763      store_dest(mach, &r[0], &inst->Dst[0], inst, CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2764   }
2765   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2766      store_dest(mach, &r[1], &inst->Dst[0], inst, CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2767   }
2768   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2769      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2770   }
2771}
2772
2773static void
2774exec_exp(struct tgsi_exec_machine *mach,
2775         const struct tgsi_full_instruction *inst)
2776{
2777   union tgsi_exec_channel r[3];
2778
2779   fetch_source(mach, &r[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2780   micro_flr(&r[1], &r[0]);  /* r1 = floor(r0) */
2781   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2782      micro_exp2(&r[2], &r[1]);       /* r2 = 2 ^ r1 */
2783      store_dest(mach, &r[2], &inst->Dst[0], inst, CHAN_X, TGSI_EXEC_DATA_FLOAT);
2784   }
2785   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2786      micro_sub(&r[2], &r[0], &r[1]); /* r2 = r0 - r1 */
2787      store_dest(mach, &r[2], &inst->Dst[0], inst, CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2788   }
2789   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2790      micro_exp2(&r[2], &r[0]);       /* r2 = 2 ^ r0 */
2791      store_dest(mach, &r[2], &inst->Dst[0], inst, CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2792   }
2793   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2794      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2795   }
2796}
2797
2798static void
2799exec_lit(struct tgsi_exec_machine *mach,
2800         const struct tgsi_full_instruction *inst)
2801{
2802   union tgsi_exec_channel r[3];
2803   union tgsi_exec_channel d[3];
2804
2805   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2806      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_X, TGSI_EXEC_DATA_FLOAT);
2807   }
2808   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_YZ) {
2809      fetch_source(mach, &r[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2810      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2811         micro_max(&d[CHAN_Y], &r[0], &ZeroVec);
2812         store_dest(mach, &d[CHAN_Y], &inst->Dst[0], inst, CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2813      }
2814
2815      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2816         fetch_source(mach, &r[1], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2817         micro_max(&r[1], &r[1], &ZeroVec);
2818
2819         fetch_source(mach, &r[2], &inst->Src[0], CHAN_W, TGSI_EXEC_DATA_FLOAT);
2820         micro_min(&r[2], &r[2], &P128Vec);
2821         micro_max(&r[2], &r[2], &M128Vec);
2822         micro_pow(&r[1], &r[1], &r[2]);
2823         micro_lt(&d[CHAN_Z], &ZeroVec, &r[0], &r[1], &ZeroVec);
2824         store_dest(mach, &d[CHAN_Z], &inst->Dst[0], inst, CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2825      }
2826   }
2827   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2828      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2829   }
2830}
2831
2832static void
2833exec_break(struct tgsi_exec_machine *mach)
2834{
2835   if (mach->BreakType == TGSI_EXEC_BREAK_INSIDE_LOOP) {
2836      /* turn off loop channels for each enabled exec channel */
2837      mach->LoopMask &= ~mach->ExecMask;
2838      /* Todo: if mach->LoopMask == 0, jump to end of loop */
2839      UPDATE_EXEC_MASK(mach);
2840   } else {
2841      assert(mach->BreakType == TGSI_EXEC_BREAK_INSIDE_SWITCH);
2842
2843      mach->Switch.mask = 0x0;
2844
2845      UPDATE_EXEC_MASK(mach);
2846   }
2847}
2848
2849static void
2850exec_switch(struct tgsi_exec_machine *mach,
2851            const struct tgsi_full_instruction *inst)
2852{
2853   assert(mach->SwitchStackTop < TGSI_EXEC_MAX_SWITCH_NESTING);
2854   assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
2855
2856   mach->SwitchStack[mach->SwitchStackTop++] = mach->Switch;
2857   fetch_source(mach, &mach->Switch.selector, &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_UINT);
2858   mach->Switch.mask = 0x0;
2859   mach->Switch.defaultMask = 0x0;
2860
2861   mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
2862   mach->BreakType = TGSI_EXEC_BREAK_INSIDE_SWITCH;
2863
2864   UPDATE_EXEC_MASK(mach);
2865}
2866
2867static void
2868exec_case(struct tgsi_exec_machine *mach,
2869          const struct tgsi_full_instruction *inst)
2870{
2871   uint prevMask = mach->SwitchStack[mach->SwitchStackTop - 1].mask;
2872   union tgsi_exec_channel src;
2873   uint mask = 0;
2874
2875   fetch_source(mach, &src, &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_UINT);
2876
2877   if (mach->Switch.selector.u[0] == src.u[0]) {
2878      mask |= 0x1;
2879   }
2880   if (mach->Switch.selector.u[1] == src.u[1]) {
2881      mask |= 0x2;
2882   }
2883   if (mach->Switch.selector.u[2] == src.u[2]) {
2884      mask |= 0x4;
2885   }
2886   if (mach->Switch.selector.u[3] == src.u[3]) {
2887      mask |= 0x8;
2888   }
2889
2890   mach->Switch.defaultMask |= mask;
2891
2892   mach->Switch.mask |= mask & prevMask;
2893
2894   UPDATE_EXEC_MASK(mach);
2895}
2896
2897static void
2898exec_default(struct tgsi_exec_machine *mach)
2899{
2900   uint prevMask = mach->SwitchStack[mach->SwitchStackTop - 1].mask;
2901
2902   mach->Switch.mask |= ~mach->Switch.defaultMask & prevMask;
2903
2904   UPDATE_EXEC_MASK(mach);
2905}
2906
2907static void
2908exec_endswitch(struct tgsi_exec_machine *mach)
2909{
2910   mach->Switch = mach->SwitchStack[--mach->SwitchStackTop];
2911   mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
2912
2913   UPDATE_EXEC_MASK(mach);
2914}
2915
2916static void
2917micro_i2f(union tgsi_exec_channel *dst,
2918          const union tgsi_exec_channel *src)
2919{
2920   dst->f[0] = (float)src->i[0];
2921   dst->f[1] = (float)src->i[1];
2922   dst->f[2] = (float)src->i[2];
2923   dst->f[3] = (float)src->i[3];
2924}
2925
2926static void
2927micro_not(union tgsi_exec_channel *dst,
2928          const union tgsi_exec_channel *src)
2929{
2930   dst->u[0] = ~src->u[0];
2931   dst->u[1] = ~src->u[1];
2932   dst->u[2] = ~src->u[2];
2933   dst->u[3] = ~src->u[3];
2934}
2935
2936static void
2937micro_shl(union tgsi_exec_channel *dst,
2938          const union tgsi_exec_channel *src0,
2939          const union tgsi_exec_channel *src1)
2940{
2941   dst->u[0] = src0->u[0] << src1->u[0];
2942   dst->u[1] = src0->u[1] << src1->u[1];
2943   dst->u[2] = src0->u[2] << src1->u[2];
2944   dst->u[3] = src0->u[3] << src1->u[3];
2945}
2946
2947static void
2948micro_and(union tgsi_exec_channel *dst,
2949          const union tgsi_exec_channel *src0,
2950          const union tgsi_exec_channel *src1)
2951{
2952   dst->u[0] = src0->u[0] & src1->u[0];
2953   dst->u[1] = src0->u[1] & src1->u[1];
2954   dst->u[2] = src0->u[2] & src1->u[2];
2955   dst->u[3] = src0->u[3] & src1->u[3];
2956}
2957
2958static void
2959micro_or(union tgsi_exec_channel *dst,
2960         const union tgsi_exec_channel *src0,
2961         const union tgsi_exec_channel *src1)
2962{
2963   dst->u[0] = src0->u[0] | src1->u[0];
2964   dst->u[1] = src0->u[1] | src1->u[1];
2965   dst->u[2] = src0->u[2] | src1->u[2];
2966   dst->u[3] = src0->u[3] | src1->u[3];
2967}
2968
2969static void
2970micro_xor(union tgsi_exec_channel *dst,
2971          const union tgsi_exec_channel *src0,
2972          const union tgsi_exec_channel *src1)
2973{
2974   dst->u[0] = src0->u[0] ^ src1->u[0];
2975   dst->u[1] = src0->u[1] ^ src1->u[1];
2976   dst->u[2] = src0->u[2] ^ src1->u[2];
2977   dst->u[3] = src0->u[3] ^ src1->u[3];
2978}
2979
2980static void
2981micro_f2i(union tgsi_exec_channel *dst,
2982          const union tgsi_exec_channel *src)
2983{
2984   dst->i[0] = (int)src->f[0];
2985   dst->i[1] = (int)src->f[1];
2986   dst->i[2] = (int)src->f[2];
2987   dst->i[3] = (int)src->f[3];
2988}
2989
2990static void
2991micro_idiv(union tgsi_exec_channel *dst,
2992           const union tgsi_exec_channel *src0,
2993           const union tgsi_exec_channel *src1)
2994{
2995   dst->i[0] = src0->i[0] / src1->i[0];
2996   dst->i[1] = src0->i[1] / src1->i[1];
2997   dst->i[2] = src0->i[2] / src1->i[2];
2998   dst->i[3] = src0->i[3] / src1->i[3];
2999}
3000
3001static void
3002micro_imax(union tgsi_exec_channel *dst,
3003           const union tgsi_exec_channel *src0,
3004           const union tgsi_exec_channel *src1)
3005{
3006   dst->i[0] = src0->i[0] > src1->i[0] ? src0->i[0] : src1->i[0];
3007   dst->i[1] = src0->i[1] > src1->i[1] ? src0->i[1] : src1->i[1];
3008   dst->i[2] = src0->i[2] > src1->i[2] ? src0->i[2] : src1->i[2];
3009   dst->i[3] = src0->i[3] > src1->i[3] ? src0->i[3] : src1->i[3];
3010}
3011
3012static void
3013micro_imin(union tgsi_exec_channel *dst,
3014           const union tgsi_exec_channel *src0,
3015           const union tgsi_exec_channel *src1)
3016{
3017   dst->i[0] = src0->i[0] < src1->i[0] ? src0->i[0] : src1->i[0];
3018   dst->i[1] = src0->i[1] < src1->i[1] ? src0->i[1] : src1->i[1];
3019   dst->i[2] = src0->i[2] < src1->i[2] ? src0->i[2] : src1->i[2];
3020   dst->i[3] = src0->i[3] < src1->i[3] ? src0->i[3] : src1->i[3];
3021}
3022
3023static void
3024micro_isge(union tgsi_exec_channel *dst,
3025           const union tgsi_exec_channel *src0,
3026           const union tgsi_exec_channel *src1)
3027{
3028   dst->i[0] = src0->i[0] >= src1->i[0] ? -1 : 0;
3029   dst->i[1] = src0->i[1] >= src1->i[1] ? -1 : 0;
3030   dst->i[2] = src0->i[2] >= src1->i[2] ? -1 : 0;
3031   dst->i[3] = src0->i[3] >= src1->i[3] ? -1 : 0;
3032}
3033
3034static void
3035micro_ishr(union tgsi_exec_channel *dst,
3036           const union tgsi_exec_channel *src0,
3037           const union tgsi_exec_channel *src1)
3038{
3039   dst->i[0] = src0->i[0] >> src1->i[0];
3040   dst->i[1] = src0->i[1] >> src1->i[1];
3041   dst->i[2] = src0->i[2] >> src1->i[2];
3042   dst->i[3] = src0->i[3] >> src1->i[3];
3043}
3044
3045static void
3046micro_islt(union tgsi_exec_channel *dst,
3047           const union tgsi_exec_channel *src0,
3048           const union tgsi_exec_channel *src1)
3049{
3050   dst->i[0] = src0->i[0] < src1->i[0] ? -1 : 0;
3051   dst->i[1] = src0->i[1] < src1->i[1] ? -1 : 0;
3052   dst->i[2] = src0->i[2] < src1->i[2] ? -1 : 0;
3053   dst->i[3] = src0->i[3] < src1->i[3] ? -1 : 0;
3054}
3055
3056static void
3057micro_f2u(union tgsi_exec_channel *dst,
3058          const union tgsi_exec_channel *src)
3059{
3060   dst->u[0] = (uint)src->f[0];
3061   dst->u[1] = (uint)src->f[1];
3062   dst->u[2] = (uint)src->f[2];
3063   dst->u[3] = (uint)src->f[3];
3064}
3065
3066static void
3067micro_u2f(union tgsi_exec_channel *dst,
3068          const union tgsi_exec_channel *src)
3069{
3070   dst->f[0] = (float)src->u[0];
3071   dst->f[1] = (float)src->u[1];
3072   dst->f[2] = (float)src->u[2];
3073   dst->f[3] = (float)src->u[3];
3074}
3075
3076static void
3077micro_uadd(union tgsi_exec_channel *dst,
3078           const union tgsi_exec_channel *src0,
3079           const union tgsi_exec_channel *src1)
3080{
3081   dst->u[0] = src0->u[0] + src1->u[0];
3082   dst->u[1] = src0->u[1] + src1->u[1];
3083   dst->u[2] = src0->u[2] + src1->u[2];
3084   dst->u[3] = src0->u[3] + src1->u[3];
3085}
3086
3087static void
3088micro_udiv(union tgsi_exec_channel *dst,
3089           const union tgsi_exec_channel *src0,
3090           const union tgsi_exec_channel *src1)
3091{
3092   dst->u[0] = src0->u[0] / src1->u[0];
3093   dst->u[1] = src0->u[1] / src1->u[1];
3094   dst->u[2] = src0->u[2] / src1->u[2];
3095   dst->u[3] = src0->u[3] / src1->u[3];
3096}
3097
3098static void
3099micro_umad(union tgsi_exec_channel *dst,
3100           const union tgsi_exec_channel *src0,
3101           const union tgsi_exec_channel *src1,
3102           const union tgsi_exec_channel *src2)
3103{
3104   dst->u[0] = src0->u[0] * src1->u[0] + src2->u[0];
3105   dst->u[1] = src0->u[1] * src1->u[1] + src2->u[1];
3106   dst->u[2] = src0->u[2] * src1->u[2] + src2->u[2];
3107   dst->u[3] = src0->u[3] * src1->u[3] + src2->u[3];
3108}
3109
3110static void
3111micro_umax(union tgsi_exec_channel *dst,
3112           const union tgsi_exec_channel *src0,
3113           const union tgsi_exec_channel *src1)
3114{
3115   dst->u[0] = src0->u[0] > src1->u[0] ? src0->u[0] : src1->u[0];
3116   dst->u[1] = src0->u[1] > src1->u[1] ? src0->u[1] : src1->u[1];
3117   dst->u[2] = src0->u[2] > src1->u[2] ? src0->u[2] : src1->u[2];
3118   dst->u[3] = src0->u[3] > src1->u[3] ? src0->u[3] : src1->u[3];
3119}
3120
3121static void
3122micro_umin(union tgsi_exec_channel *dst,
3123           const union tgsi_exec_channel *src0,
3124           const union tgsi_exec_channel *src1)
3125{
3126   dst->u[0] = src0->u[0] < src1->u[0] ? src0->u[0] : src1->u[0];
3127   dst->u[1] = src0->u[1] < src1->u[1] ? src0->u[1] : src1->u[1];
3128   dst->u[2] = src0->u[2] < src1->u[2] ? src0->u[2] : src1->u[2];
3129   dst->u[3] = src0->u[3] < src1->u[3] ? src0->u[3] : src1->u[3];
3130}
3131
3132static void
3133micro_umod(union tgsi_exec_channel *dst,
3134           const union tgsi_exec_channel *src0,
3135           const union tgsi_exec_channel *src1)
3136{
3137   dst->u[0] = src0->u[0] % src1->u[0];
3138   dst->u[1] = src0->u[1] % src1->u[1];
3139   dst->u[2] = src0->u[2] % src1->u[2];
3140   dst->u[3] = src0->u[3] % src1->u[3];
3141}
3142
3143static void
3144micro_umul(union tgsi_exec_channel *dst,
3145           const union tgsi_exec_channel *src0,
3146           const union tgsi_exec_channel *src1)
3147{
3148   dst->u[0] = src0->u[0] * src1->u[0];
3149   dst->u[1] = src0->u[1] * src1->u[1];
3150   dst->u[2] = src0->u[2] * src1->u[2];
3151   dst->u[3] = src0->u[3] * src1->u[3];
3152}
3153
3154static void
3155micro_useq(union tgsi_exec_channel *dst,
3156           const union tgsi_exec_channel *src0,
3157           const union tgsi_exec_channel *src1)
3158{
3159   dst->u[0] = src0->u[0] == src1->u[0] ? ~0 : 0;
3160   dst->u[1] = src0->u[1] == src1->u[1] ? ~0 : 0;
3161   dst->u[2] = src0->u[2] == src1->u[2] ? ~0 : 0;
3162   dst->u[3] = src0->u[3] == src1->u[3] ? ~0 : 0;
3163}
3164
3165static void
3166micro_usge(union tgsi_exec_channel *dst,
3167           const union tgsi_exec_channel *src0,
3168           const union tgsi_exec_channel *src1)
3169{
3170   dst->u[0] = src0->u[0] >= src1->u[0] ? ~0 : 0;
3171   dst->u[1] = src0->u[1] >= src1->u[1] ? ~0 : 0;
3172   dst->u[2] = src0->u[2] >= src1->u[2] ? ~0 : 0;
3173   dst->u[3] = src0->u[3] >= src1->u[3] ? ~0 : 0;
3174}
3175
3176static void
3177micro_ushr(union tgsi_exec_channel *dst,
3178           const union tgsi_exec_channel *src0,
3179           const union tgsi_exec_channel *src1)
3180{
3181   dst->u[0] = src0->u[0] >> src1->u[0];
3182   dst->u[1] = src0->u[1] >> src1->u[1];
3183   dst->u[2] = src0->u[2] >> src1->u[2];
3184   dst->u[3] = src0->u[3] >> src1->u[3];
3185}
3186
3187static void
3188micro_uslt(union tgsi_exec_channel *dst,
3189           const union tgsi_exec_channel *src0,
3190           const union tgsi_exec_channel *src1)
3191{
3192   dst->u[0] = src0->u[0] < src1->u[0] ? ~0 : 0;
3193   dst->u[1] = src0->u[1] < src1->u[1] ? ~0 : 0;
3194   dst->u[2] = src0->u[2] < src1->u[2] ? ~0 : 0;
3195   dst->u[3] = src0->u[3] < src1->u[3] ? ~0 : 0;
3196}
3197
3198static void
3199micro_usne(union tgsi_exec_channel *dst,
3200           const union tgsi_exec_channel *src0,
3201           const union tgsi_exec_channel *src1)
3202{
3203   dst->u[0] = src0->u[0] != src1->u[0] ? ~0 : 0;
3204   dst->u[1] = src0->u[1] != src1->u[1] ? ~0 : 0;
3205   dst->u[2] = src0->u[2] != src1->u[2] ? ~0 : 0;
3206   dst->u[3] = src0->u[3] != src1->u[3] ? ~0 : 0;
3207}
3208
3209static void
3210exec_instruction(
3211   struct tgsi_exec_machine *mach,
3212   const struct tgsi_full_instruction *inst,
3213   int *pc )
3214{
3215   union tgsi_exec_channel r[10];
3216
3217   (*pc)++;
3218
3219   switch (inst->Instruction.Opcode) {
3220   case TGSI_OPCODE_ARL:
3221      exec_vector_unary(mach, inst, micro_arl, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
3222      break;
3223
3224   case TGSI_OPCODE_MOV:
3225      exec_vector_unary(mach, inst, micro_mov, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
3226      break;
3227
3228   case TGSI_OPCODE_LIT:
3229      exec_lit(mach, inst);
3230      break;
3231
3232   case TGSI_OPCODE_RCP:
3233      exec_scalar_unary(mach, inst, micro_rcp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3234      break;
3235
3236   case TGSI_OPCODE_RSQ:
3237      exec_scalar_unary(mach, inst, micro_rsq, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3238      break;
3239
3240   case TGSI_OPCODE_EXP:
3241      exec_exp(mach, inst);
3242      break;
3243
3244   case TGSI_OPCODE_LOG:
3245      exec_log(mach, inst);
3246      break;
3247
3248   case TGSI_OPCODE_MUL:
3249      exec_vector_binary(mach, inst, micro_mul, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3250      break;
3251
3252   case TGSI_OPCODE_ADD:
3253      exec_vector_binary(mach, inst, micro_add, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3254      break;
3255
3256   case TGSI_OPCODE_DP3:
3257      exec_dp3(mach, inst);
3258      break;
3259
3260   case TGSI_OPCODE_DP4:
3261      exec_dp4(mach, inst);
3262      break;
3263
3264   case TGSI_OPCODE_DST:
3265      exec_dst(mach, inst);
3266      break;
3267
3268   case TGSI_OPCODE_MIN:
3269      exec_vector_binary(mach, inst, micro_min, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3270      break;
3271
3272   case TGSI_OPCODE_MAX:
3273      exec_vector_binary(mach, inst, micro_max, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3274      break;
3275
3276   case TGSI_OPCODE_SLT:
3277      exec_vector_binary(mach, inst, micro_slt, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3278      break;
3279
3280   case TGSI_OPCODE_SGE:
3281      exec_vector_binary(mach, inst, micro_sge, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3282      break;
3283
3284   case TGSI_OPCODE_MAD:
3285      exec_vector_trinary(mach, inst, micro_mad, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3286      break;
3287
3288   case TGSI_OPCODE_SUB:
3289      exec_vector_binary(mach, inst, micro_sub, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3290      break;
3291
3292   case TGSI_OPCODE_LRP:
3293      exec_vector_trinary(mach, inst, micro_lrp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3294      break;
3295
3296   case TGSI_OPCODE_CND:
3297      exec_vector_trinary(mach, inst, micro_cnd, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3298      break;
3299
3300   case TGSI_OPCODE_DP2A:
3301      exec_dp2a(mach, inst);
3302      break;
3303
3304   case TGSI_OPCODE_FRC:
3305      exec_vector_unary(mach, inst, micro_frc, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3306      break;
3307
3308   case TGSI_OPCODE_CLAMP:
3309      exec_vector_trinary(mach, inst, micro_clamp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3310      break;
3311
3312   case TGSI_OPCODE_FLR:
3313      exec_vector_unary(mach, inst, micro_flr, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3314      break;
3315
3316   case TGSI_OPCODE_ROUND:
3317      exec_vector_unary(mach, inst, micro_rnd, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3318      break;
3319
3320   case TGSI_OPCODE_EX2:
3321      exec_scalar_unary(mach, inst, micro_exp2, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3322      break;
3323
3324   case TGSI_OPCODE_LG2:
3325      exec_scalar_unary(mach, inst, micro_lg2, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3326      break;
3327
3328   case TGSI_OPCODE_POW:
3329      exec_scalar_binary(mach, inst, micro_pow, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3330      break;
3331
3332   case TGSI_OPCODE_XPD:
3333      exec_xpd(mach, inst);
3334      break;
3335
3336   case TGSI_OPCODE_ABS:
3337      exec_vector_unary(mach, inst, micro_abs, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3338      break;
3339
3340   case TGSI_OPCODE_RCC:
3341      exec_scalar_unary(mach, inst, micro_rcc, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3342      break;
3343
3344   case TGSI_OPCODE_DPH:
3345      exec_dph(mach, inst);
3346      break;
3347
3348   case TGSI_OPCODE_COS:
3349      exec_scalar_unary(mach, inst, micro_cos, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3350      break;
3351
3352   case TGSI_OPCODE_DDX:
3353      exec_vector_unary(mach, inst, micro_ddx, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3354      break;
3355
3356   case TGSI_OPCODE_DDY:
3357      exec_vector_unary(mach, inst, micro_ddy, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3358      break;
3359
3360   case TGSI_OPCODE_KILP:
3361      exec_kilp (mach, inst);
3362      break;
3363
3364   case TGSI_OPCODE_KIL:
3365      exec_kil (mach, inst);
3366      break;
3367
3368   case TGSI_OPCODE_PK2H:
3369      assert (0);
3370      break;
3371
3372   case TGSI_OPCODE_PK2US:
3373      assert (0);
3374      break;
3375
3376   case TGSI_OPCODE_PK4B:
3377      assert (0);
3378      break;
3379
3380   case TGSI_OPCODE_PK4UB:
3381      assert (0);
3382      break;
3383
3384   case TGSI_OPCODE_RFL:
3385      exec_rfl(mach, inst);
3386      break;
3387
3388   case TGSI_OPCODE_SEQ:
3389      exec_vector_binary(mach, inst, micro_seq, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3390      break;
3391
3392   case TGSI_OPCODE_SFL:
3393      exec_vector(mach, inst, micro_sfl, TGSI_EXEC_DATA_FLOAT);
3394      break;
3395
3396   case TGSI_OPCODE_SGT:
3397      exec_vector_binary(mach, inst, micro_sgt, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3398      break;
3399
3400   case TGSI_OPCODE_SIN:
3401      exec_scalar_unary(mach, inst, micro_sin, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3402      break;
3403
3404   case TGSI_OPCODE_SLE:
3405      exec_vector_binary(mach, inst, micro_sle, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3406      break;
3407
3408   case TGSI_OPCODE_SNE:
3409      exec_vector_binary(mach, inst, micro_sne, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3410      break;
3411
3412   case TGSI_OPCODE_STR:
3413      exec_vector(mach, inst, micro_str, TGSI_EXEC_DATA_FLOAT);
3414      break;
3415
3416   case TGSI_OPCODE_TEX:
3417      /* simple texture lookup */
3418      /* src[0] = texcoord */
3419      /* src[1] = sampler unit */
3420      exec_tex(mach, inst, TEX_MODIFIER_NONE);
3421      break;
3422
3423   case TGSI_OPCODE_TXB:
3424      /* Texture lookup with lod bias */
3425      /* src[0] = texcoord (src[0].w = LOD bias) */
3426      /* src[1] = sampler unit */
3427      exec_tex(mach, inst, TEX_MODIFIER_LOD_BIAS);
3428      break;
3429
3430   case TGSI_OPCODE_TXD:
      /* Texture lookup with explicit partial derivatives */
3432      /* src[0] = texcoord */
3433      /* src[1] = d[strq]/dx */
3434      /* src[2] = d[strq]/dy */
3435      /* src[3] = sampler unit */
3436      exec_txd(mach, inst);
3437      break;
3438
3439   case TGSI_OPCODE_TXL:
      /* Texture lookup with explicit LOD */
3441      /* src[0] = texcoord (src[0].w = LOD) */
3442      /* src[1] = sampler unit */
3443      exec_tex(mach, inst, TEX_MODIFIER_EXPLICIT_LOD);
3444      break;
3445
3446   case TGSI_OPCODE_TXP:
3447      /* Texture lookup with projection */
3448      /* src[0] = texcoord (src[0].w = projection) */
3449      /* src[1] = sampler unit */
3450      exec_tex(mach, inst, TEX_MODIFIER_PROJECTED);
3451      break;
3452
3453   case TGSI_OPCODE_UP2H:
3454      assert (0);
3455      break;
3456
3457   case TGSI_OPCODE_UP2US:
3458      assert (0);
3459      break;
3460
3461   case TGSI_OPCODE_UP4B:
3462      assert (0);
3463      break;
3464
3465   case TGSI_OPCODE_UP4UB:
3466      assert (0);
3467      break;
3468
3469   case TGSI_OPCODE_X2D:
3470      exec_x2d(mach, inst);
3471      break;
3472
3473   case TGSI_OPCODE_ARA:
3474      assert (0);
3475      break;
3476
3477   case TGSI_OPCODE_ARR:
3478      exec_vector_unary(mach, inst, micro_arr, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
3479      break;
3480
3481   case TGSI_OPCODE_BRA:
3482      assert (0);
3483      break;
3484
3485   case TGSI_OPCODE_CAL:
3486      /* skip the call if no execution channels are enabled */
3487      if (mach->ExecMask) {
3488         /* do the call */
3489
3490         /* First, record the depths of the execution stacks.
3491          * This is important for deeply nested/looped return statements.
3492          * We have to unwind the stacks by the correct amount.  For a
3493          * real code generator, we could determine the number of entries
3494          * to pop off each stack with simple static analysis and avoid
3495          * implementing this data structure at run time.
3496          */
3497         mach->CallStack[mach->CallStackTop].CondStackTop = mach->CondStackTop;
3498         mach->CallStack[mach->CallStackTop].LoopStackTop = mach->LoopStackTop;
3499         mach->CallStack[mach->CallStackTop].ContStackTop = mach->ContStackTop;
3500         mach->CallStack[mach->CallStackTop].SwitchStackTop = mach->SwitchStackTop;
3501         mach->CallStack[mach->CallStackTop].BreakStackTop = mach->BreakStackTop;
3502         /* note that PC was already incremented above */
3503         mach->CallStack[mach->CallStackTop].ReturnAddr = *pc;
3504
3505         mach->CallStackTop++;
3506
3507         /* Second, push the Cond, Loop, Cont, Func stacks */
3508         assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
3509         assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3510         assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3511         assert(mach->SwitchStackTop < TGSI_EXEC_MAX_SWITCH_NESTING);
3512         assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
3513         assert(mach->FuncStackTop < TGSI_EXEC_MAX_CALL_NESTING);
3514
3515         mach->CondStack[mach->CondStackTop++] = mach->CondMask;
3516         mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
3517         mach->ContStack[mach->ContStackTop++] = mach->ContMask;
3518         mach->SwitchStack[mach->SwitchStackTop++] = mach->Switch;
3519         mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
3520         mach->FuncStack[mach->FuncStackTop++] = mach->FuncMask;
3521
3522         /* Finally, jump to the subroutine */
3523         *pc = inst->Label.Label;
3524      }
3525      break;
3526
3527   case TGSI_OPCODE_RET:
3528      mach->FuncMask &= ~mach->ExecMask;
3529      UPDATE_EXEC_MASK(mach);
3530
3531      if (mach->FuncMask == 0x0) {
         /* really return now (otherwise, keep executing) */
3533
3534         if (mach->CallStackTop == 0) {
3535            /* returning from main() */
3536            mach->CondStackTop = 0;
3537            mach->LoopStackTop = 0;
3538            *pc = -1;
3539            return;
3540         }
3541
3542         assert(mach->CallStackTop > 0);
3543         mach->CallStackTop--;
3544
3545         mach->CondStackTop = mach->CallStack[mach->CallStackTop].CondStackTop;
3546         mach->CondMask = mach->CondStack[mach->CondStackTop];
3547
3548         mach->LoopStackTop = mach->CallStack[mach->CallStackTop].LoopStackTop;
3549         mach->LoopMask = mach->LoopStack[mach->LoopStackTop];
3550
3551         mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
3552         mach->ContMask = mach->ContStack[mach->ContStackTop];
3553
3554         mach->SwitchStackTop = mach->CallStack[mach->CallStackTop].SwitchStackTop;
3555         mach->Switch = mach->SwitchStack[mach->SwitchStackTop];
3556
3557         mach->BreakStackTop = mach->CallStack[mach->CallStackTop].BreakStackTop;
3558         mach->BreakType = mach->BreakStack[mach->BreakStackTop];
3559
3560         assert(mach->FuncStackTop > 0);
3561         mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
3562
3563         *pc = mach->CallStack[mach->CallStackTop].ReturnAddr;
3564
3565         UPDATE_EXEC_MASK(mach);
3566      }
3567      break;
3568
3569   case TGSI_OPCODE_SSG:
3570      exec_vector_unary(mach, inst, micro_sgn, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3571      break;
3572
3573   case TGSI_OPCODE_CMP:
3574      exec_vector_trinary(mach, inst, micro_cmp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3575      break;
3576
3577   case TGSI_OPCODE_SCS:
3578      exec_scs(mach, inst);
3579      break;
3580
3581   case TGSI_OPCODE_NRM:
3582      exec_nrm3(mach, inst);
3583      break;
3584
3585   case TGSI_OPCODE_NRM4:
3586      exec_nrm4(mach, inst);
3587      break;
3588
3589   case TGSI_OPCODE_DIV:
3590      exec_vector_binary(mach, inst, micro_div, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3591      break;
3592
3593   case TGSI_OPCODE_DP2:
3594      exec_dp2(mach, inst);
3595      break;
3596
3597   case TGSI_OPCODE_IF:
3598      /* push CondMask */
3599      assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
3600      mach->CondStack[mach->CondStackTop++] = mach->CondMask;
3601      FETCH( &r[0], 0, CHAN_X );
3602      /* update CondMask */
3603      if( ! r[0].u[0] ) {
3604         mach->CondMask &= ~0x1;
3605      }
3606      if( ! r[0].u[1] ) {
3607         mach->CondMask &= ~0x2;
3608      }
3609      if( ! r[0].u[2] ) {
3610         mach->CondMask &= ~0x4;
3611      }
3612      if( ! r[0].u[3] ) {
3613         mach->CondMask &= ~0x8;
3614      }
3615      UPDATE_EXEC_MASK(mach);
3616      /* Todo: If CondMask==0, jump to ELSE */
3617      break;
3618
3619   case TGSI_OPCODE_ELSE:
3620      /* invert CondMask wrt previous mask */
3621      {
3622         uint prevMask;
3623         assert(mach->CondStackTop > 0);
3624         prevMask = mach->CondStack[mach->CondStackTop - 1];
3625         mach->CondMask = ~mach->CondMask & prevMask;
3626         UPDATE_EXEC_MASK(mach);
3627         /* Todo: If CondMask==0, jump to ENDIF */
3628      }
3629      break;
3630
3631   case TGSI_OPCODE_ENDIF:
3632      /* pop CondMask */
3633      assert(mach->CondStackTop > 0);
3634      mach->CondMask = mach->CondStack[--mach->CondStackTop];
3635      UPDATE_EXEC_MASK(mach);
3636      break;
3637
3638   case TGSI_OPCODE_END:
3639      /* make sure we end primitives which haven't
3640       * been explicitly emitted */
3641      conditional_emit_primitive(mach);
3642      /* halt execution */
3643      *pc = -1;
3644      break;
3645
3646   case TGSI_OPCODE_PUSHA:
3647      assert (0);
3648      break;
3649
3650   case TGSI_OPCODE_POPA:
3651      assert (0);
3652      break;
3653
3654   case TGSI_OPCODE_CEIL:
3655      exec_vector_unary(mach, inst, micro_ceil, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3656      break;
3657
3658   case TGSI_OPCODE_I2F:
3659      exec_vector_unary(mach, inst, micro_i2f, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_INT);
3660      break;
3661
3662   case TGSI_OPCODE_NOT:
3663      exec_vector_unary(mach, inst, micro_not, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3664      break;
3665
3666   case TGSI_OPCODE_TRUNC:
3667      exec_vector_unary(mach, inst, micro_trunc, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3668      break;
3669
3670   case TGSI_OPCODE_SHL:
3671      exec_vector_binary(mach, inst, micro_shl, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3672      break;
3673
3674   case TGSI_OPCODE_AND:
3675      exec_vector_binary(mach, inst, micro_and, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3676      break;
3677
3678   case TGSI_OPCODE_OR:
3679      exec_vector_binary(mach, inst, micro_or, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3680      break;
3681
3682   case TGSI_OPCODE_MOD:
3683      assert (0);
3684      break;
3685
3686   case TGSI_OPCODE_XOR:
3687      exec_vector_binary(mach, inst, micro_xor, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3688      break;
3689
3690   case TGSI_OPCODE_SAD:
3691      assert (0);
3692      break;
3693
3694   case TGSI_OPCODE_TXF:
3695      assert (0);
3696      break;
3697
3698   case TGSI_OPCODE_TXQ:
3699      assert (0);
3700      break;
3701
3702   case TGSI_OPCODE_EMIT:
3703      emit_vertex(mach);
3704      break;
3705
3706   case TGSI_OPCODE_ENDPRIM:
3707      emit_primitive(mach);
3708      break;
3709
3710   case TGSI_OPCODE_BGNLOOP:
3711      /* push LoopMask and ContMasks */
3712      assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3713      assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3714      assert(mach->LoopLabelStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3715      assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
3716
3717      mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
3718      mach->ContStack[mach->ContStackTop++] = mach->ContMask;
3719      mach->LoopLabelStack[mach->LoopLabelStackTop++] = *pc - 1;
3720      mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
3721      mach->BreakType = TGSI_EXEC_BREAK_INSIDE_LOOP;
3722      break;
3723
3724   case TGSI_OPCODE_ENDLOOP:
3725      /* Restore ContMask, but don't pop */
3726      assert(mach->ContStackTop > 0);
3727      mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
3728      UPDATE_EXEC_MASK(mach);
3729      if (mach->ExecMask) {
3730         /* repeat loop: jump to instruction just past BGNLOOP */
3731         assert(mach->LoopLabelStackTop > 0);
3732         *pc = mach->LoopLabelStack[mach->LoopLabelStackTop - 1] + 1;
3733      }
3734      else {
3735         /* exit loop: pop LoopMask */
3736         assert(mach->LoopStackTop > 0);
3737         mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
3738         /* pop ContMask */
3739         assert(mach->ContStackTop > 0);
3740         mach->ContMask = mach->ContStack[--mach->ContStackTop];
3741         assert(mach->LoopLabelStackTop > 0);
3742         --mach->LoopLabelStackTop;
3743
3744         mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
3745      }
3746      UPDATE_EXEC_MASK(mach);
3747      break;
3748
3749   case TGSI_OPCODE_BRK:
3750      exec_break(mach);
3751      break;
3752
3753   case TGSI_OPCODE_CONT:
3754      /* turn off cont channels for each enabled exec channel */
3755      mach->ContMask &= ~mach->ExecMask;
3756      /* Todo: if mach->LoopMask == 0, jump to end of loop */
3757      UPDATE_EXEC_MASK(mach);
3758      break;
3759
3760   case TGSI_OPCODE_BGNSUB:
3761      /* no-op */
3762      break;
3763
3764   case TGSI_OPCODE_ENDSUB:
3765      /*
3766       * XXX: This really should be a no-op. We should never reach this opcode.
3767       */
3768
3769      assert(mach->CallStackTop > 0);
3770      mach->CallStackTop--;
3771
3772      mach->CondStackTop = mach->CallStack[mach->CallStackTop].CondStackTop;
3773      mach->CondMask = mach->CondStack[mach->CondStackTop];
3774
3775      mach->LoopStackTop = mach->CallStack[mach->CallStackTop].LoopStackTop;
3776      mach->LoopMask = mach->LoopStack[mach->LoopStackTop];
3777
3778      mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
3779      mach->ContMask = mach->ContStack[mach->ContStackTop];
3780
3781      mach->SwitchStackTop = mach->CallStack[mach->CallStackTop].SwitchStackTop;
3782      mach->Switch = mach->SwitchStack[mach->SwitchStackTop];
3783
3784      mach->BreakStackTop = mach->CallStack[mach->CallStackTop].BreakStackTop;
3785      mach->BreakType = mach->BreakStack[mach->BreakStackTop];
3786
3787      assert(mach->FuncStackTop > 0);
3788      mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
3789
3790      *pc = mach->CallStack[mach->CallStackTop].ReturnAddr;
3791
3792      UPDATE_EXEC_MASK(mach);
3793      break;
3794
3795   case TGSI_OPCODE_NOP:
3796      break;
3797
3798   case TGSI_OPCODE_BREAKC:
3799      FETCH(&r[0], 0, CHAN_X);
3800      /* update CondMask */
3801      if (r[0].u[0] && (mach->ExecMask & 0x1)) {
3802         mach->LoopMask &= ~0x1;
3803      }
3804      if (r[0].u[1] && (mach->ExecMask & 0x2)) {
3805         mach->LoopMask &= ~0x2;
3806      }
3807      if (r[0].u[2] && (mach->ExecMask & 0x4)) {
3808         mach->LoopMask &= ~0x4;
3809      }
3810      if (r[0].u[3] && (mach->ExecMask & 0x8)) {
3811         mach->LoopMask &= ~0x8;
3812      }
3813      /* Todo: if mach->LoopMask == 0, jump to end of loop */
3814      UPDATE_EXEC_MASK(mach);
3815      break;
3816
3817   case TGSI_OPCODE_F2I:
3818      exec_vector_unary(mach, inst, micro_f2i, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
3819      break;
3820
3821   case TGSI_OPCODE_IDIV:
3822      exec_vector_binary(mach, inst, micro_idiv, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3823      break;
3824
3825   case TGSI_OPCODE_IMAX:
3826      exec_vector_binary(mach, inst, micro_imax, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3827      break;
3828
3829   case TGSI_OPCODE_IMIN:
3830      exec_vector_binary(mach, inst, micro_imin, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3831      break;
3832
3833   case TGSI_OPCODE_INEG:
3834      exec_vector_unary(mach, inst, micro_ineg, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3835      break;
3836
3837   case TGSI_OPCODE_ISGE:
3838      exec_vector_binary(mach, inst, micro_isge, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3839      break;
3840
3841   case TGSI_OPCODE_ISHR:
3842      exec_vector_binary(mach, inst, micro_ishr, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3843      break;
3844
3845   case TGSI_OPCODE_ISLT:
3846      exec_vector_binary(mach, inst, micro_islt, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3847      break;
3848
3849   case TGSI_OPCODE_F2U:
3850      exec_vector_unary(mach, inst, micro_f2u, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
3851      break;
3852
3853   case TGSI_OPCODE_U2F:
3854      exec_vector_unary(mach, inst, micro_u2f, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_UINT);
3855      break;
3856
3857   case TGSI_OPCODE_UADD:
3858      exec_vector_binary(mach, inst, micro_uadd, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3859      break;
3860
3861   case TGSI_OPCODE_UDIV:
3862      exec_vector_binary(mach, inst, micro_udiv, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3863      break;
3864
3865   case TGSI_OPCODE_UMAD:
3866      exec_vector_trinary(mach, inst, micro_umad, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3867      break;
3868
3869   case TGSI_OPCODE_UMAX:
3870      exec_vector_binary(mach, inst, micro_umax, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3871      break;
3872
3873   case TGSI_OPCODE_UMIN:
3874      exec_vector_binary(mach, inst, micro_umin, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3875      break;
3876
3877   case TGSI_OPCODE_UMOD:
3878      exec_vector_binary(mach, inst, micro_umod, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3879      break;
3880
3881   case TGSI_OPCODE_UMUL:
3882      exec_vector_binary(mach, inst, micro_umul, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3883      break;
3884
3885   case TGSI_OPCODE_USEQ:
3886      exec_vector_binary(mach, inst, micro_useq, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3887      break;
3888
3889   case TGSI_OPCODE_USGE:
3890      exec_vector_binary(mach, inst, micro_usge, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3891      break;
3892
3893   case TGSI_OPCODE_USHR:
3894      exec_vector_binary(mach, inst, micro_ushr, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3895      break;
3896
3897   case TGSI_OPCODE_USLT:
3898      exec_vector_binary(mach, inst, micro_uslt, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3899      break;
3900
3901   case TGSI_OPCODE_USNE:
3902      exec_vector_binary(mach, inst, micro_usne, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3903      break;
3904
3905   case TGSI_OPCODE_SWITCH:
3906      exec_switch(mach, inst);
3907      break;
3908
3909   case TGSI_OPCODE_CASE:
3910      exec_case(mach, inst);
3911      break;
3912
3913   case TGSI_OPCODE_DEFAULT:
3914      exec_default(mach);
3915      break;
3916
3917   case TGSI_OPCODE_ENDSWITCH:
3918      exec_endswitch(mach);
3919      break;
3920
3921   case TGSI_OPCODE_LOAD:
3922      assert(0);
3923      break;
3924
3925   case TGSI_OPCODE_LOAD_MS:
3926      assert(0);
3927      break;
3928
3929   case TGSI_OPCODE_SAMPLE:
3930      exec_sample(mach, inst, TEX_MODIFIER_NONE);
3931      break;
3932
3933   case TGSI_OPCODE_SAMPLE_B:
3934      exec_sample(mach, inst, TEX_MODIFIER_LOD_BIAS);
3935      break;
3936
3937   case TGSI_OPCODE_SAMPLE_C:
3938      exec_sample(mach, inst, TEX_MODIFIER_NONE);
3939      break;
3940
3941   case TGSI_OPCODE_SAMPLE_C_LZ:
3942      exec_sample(mach, inst, TEX_MODIFIER_LOD_BIAS);
3943      break;
3944
3945   case TGSI_OPCODE_SAMPLE_D:
3946      exec_sample_d(mach, inst);
3947      break;
3948
3949   case TGSI_OPCODE_SAMPLE_L:
3950      exec_sample(mach, inst, TEX_MODIFIER_EXPLICIT_LOD);
3951      break;
3952
3953   case TGSI_OPCODE_GATHER4:
3954      assert(0);
3955      break;
3956
3957   case TGSI_OPCODE_RESINFO:
3958      assert(0);
3959      break;
3960
3961   case TGSI_OPCODE_SAMPLE_POS:
3962      assert(0);
3963      break;
3964
3965   case TGSI_OPCODE_SAMPLE_INFO:
3966      assert(0);
3967      break;
3968
3969   default:
3970      assert( 0 );
3971   }
3972}
3973
3974
3975#define DEBUG_EXECUTION 0
3976
3977
3978/**
3979 * Run TGSI interpreter.
3980 * \return bitmask of "alive" quad components
3981 */
3982uint
3983tgsi_exec_machine_run( struct tgsi_exec_machine *mach )
3984{
3985   uint i;
3986   int pc = 0;
3987
3988   mach->CondMask = 0xf;
3989   mach->LoopMask = 0xf;
3990   mach->ContMask = 0xf;
3991   mach->FuncMask = 0xf;
3992   mach->ExecMask = 0xf;
3993
3994   mach->Switch.mask = 0xf;
3995
3996   assert(mach->CondStackTop == 0);
3997   assert(mach->LoopStackTop == 0);
3998   assert(mach->ContStackTop == 0);
3999   assert(mach->SwitchStackTop == 0);
4000   assert(mach->BreakStackTop == 0);
4001   assert(mach->CallStackTop == 0);
4002
4003   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] = 0;
4004   mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] = 0;
4005
4006   if( mach->Processor == TGSI_PROCESSOR_GEOMETRY ) {
4007      mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0] = 0;
4008      mach->Primitives[0] = 0;
4009   }
4010
4011   /* execute declarations (interpolants) */
4012   for (i = 0; i < mach->NumDeclarations; i++) {
4013      exec_declaration( mach, mach->Declarations+i );
4014   }
4015
4016   {
4017#if DEBUG_EXECUTION
4018      struct tgsi_exec_vector temps[TGSI_EXEC_NUM_TEMPS + TGSI_EXEC_NUM_TEMP_EXTRAS];
4019      struct tgsi_exec_vector outputs[PIPE_MAX_ATTRIBS];
4020      uint inst = 1;
4021
4022      memcpy(temps, mach->Temps, sizeof(temps));
4023      memcpy(outputs, mach->Outputs, sizeof(outputs));
4024#endif
4025
4026      /* execute instructions, until pc is set to -1 */
4027      while (pc != -1) {
4028
4029#if DEBUG_EXECUTION
4030         uint i;
4031
4032         tgsi_dump_instruction(&mach->Instructions[pc], inst++);
4033#endif
4034
4035         assert(pc < (int) mach->NumInstructions);
4036         exec_instruction(mach, mach->Instructions + pc, &pc);
4037
4038#if DEBUG_EXECUTION
4039         for (i = 0; i < TGSI_EXEC_NUM_TEMPS + TGSI_EXEC_NUM_TEMP_EXTRAS; i++) {
4040            if (memcmp(&temps[i], &mach->Temps[i], sizeof(temps[i]))) {
4041               uint j;
4042
4043               memcpy(&temps[i], &mach->Temps[i], sizeof(temps[i]));
4044               debug_printf("TEMP[%2u] = ", i);
4045               for (j = 0; j < 4; j++) {
4046                  if (j > 0) {
4047                     debug_printf("           ");
4048                  }
4049                  debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
4050                               temps[i].xyzw[0].f[j], temps[i].xyzw[0].u[j],
4051                               temps[i].xyzw[1].f[j], temps[i].xyzw[1].u[j],
4052                               temps[i].xyzw[2].f[j], temps[i].xyzw[2].u[j],
4053                               temps[i].xyzw[3].f[j], temps[i].xyzw[3].u[j]);
4054               }
4055            }
4056         }
4057         for (i = 0; i < PIPE_MAX_ATTRIBS; i++) {
4058            if (memcmp(&outputs[i], &mach->Outputs[i], sizeof(outputs[i]))) {
4059               uint j;
4060
4061               memcpy(&outputs[i], &mach->Outputs[i], sizeof(outputs[i]));
4062               debug_printf("OUT[%2u] =  ", i);
4063               for (j = 0; j < 4; j++) {
4064                  if (j > 0) {
4065                     debug_printf("           ");
4066                  }
4067                  debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
4068                               outputs[i].xyzw[0].f[j], outputs[i].xyzw[0].u[j],
4069                               outputs[i].xyzw[1].f[j], outputs[i].xyzw[1].u[j],
4070                               outputs[i].xyzw[2].f[j], outputs[i].xyzw[2].u[j],
4071                               outputs[i].xyzw[3].f[j], outputs[i].xyzw[3].u[j]);
4072               }
4073            }
4074         }
4075#endif
4076      }
4077   }
4078
4079#if 0
4080   /* we scale from floats in [0,1] to Zbuffer ints in sp_quad_depth_test.c */
4081   if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) {
4082      /*
4083       * Scale back depth component.
4084       */
4085      for (i = 0; i < 4; i++)
4086         mach->Outputs[0].xyzw[2].f[i] *= ctx->DrawBuffer->_DepthMaxF;
4087   }
4088#endif
4089
4090   /* Strictly speaking, these assertions aren't really needed but they
4091    * can potentially catch some bugs in the control flow code.
4092    */
4093   assert(mach->CondStackTop == 0);
4094   assert(mach->LoopStackTop == 0);
4095   assert(mach->ContStackTop == 0);
4096   assert(mach->SwitchStackTop == 0);
4097   assert(mach->BreakStackTop == 0);
4098   assert(mach->CallStackTop == 0);
4099
4100   return ~mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
4101}
4102