1/**************************************************************************
2 *
3 * Copyright 2007-2008 VMware, Inc.
4 * All Rights Reserved.
5 * Copyright 2009-2010 VMware, Inc.  All rights Reserved.
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the
9 * "Software"), to deal in the Software without restriction, including
10 * without limitation the rights to use, copy, modify, merge, publish,
11 * distribute, sub license, and/or sell copies of the Software, and to
12 * permit persons to whom the Software is furnished to do so, subject to
13 * the following conditions:
14 *
15 * The above copyright notice and this permission notice (including the
16 * next paragraph) shall be included in all copies or substantial portions
17 * of the Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
22 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
23 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
24 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
25 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 *
27 **************************************************************************/
28
29/**
30 * TGSI interpreter/executor.
31 *
32 * Flow control information:
33 *
34 * Since we operate on 'quads' (4 pixels or 4 vertices in parallel)
35 * flow control statements (IF/ELSE/ENDIF, LOOP/ENDLOOP) require special
36 * care since a condition may be true for some quad components but false
37 * for other components.
38 *
39 * We basically execute all statements (even if they're in the part of
40 * an IF/ELSE clause that's "not taken") and use a special mask to
41 * control writing to destination registers.  This is the ExecMask.
42 * See store_dest().
43 *
 * The ExecMask is the AND of several other masks (CondMask, LoopMask,
 * ContMask, Switch.mask and FuncMask; see UPDATE_EXEC_MASK).  CondMask,
 * LoopMask and ContMask are controlled by the flow control instructions
 * (namely IF/ELSE/ENDIF, LOOP/ENDLOOP and CONT).
47 *
48 *
49 * Authors:
50 *   Michal Krol
51 *   Brian Paul
52 */
53
54#include "pipe/p_compiler.h"
55#include "pipe/p_state.h"
56#include "pipe/p_shader_tokens.h"
57#include "tgsi/tgsi_dump.h"
58#include "tgsi/tgsi_parse.h"
59#include "tgsi/tgsi_util.h"
60#include "tgsi_exec.h"
61#include "util/u_half.h"
62#include "util/u_memory.h"
63#include "util/u_math.h"
64#include "util/rounding.h"
65
66
67#define DEBUG_EXECUTION 0
68
69
70#define FAST_MATH 0
71
72#define TILE_TOP_LEFT     0
73#define TILE_TOP_RIGHT    1
74#define TILE_BOTTOM_LEFT  2
75#define TILE_BOTTOM_RIGHT 3
76
/**
 * One 64-bit value for each of TGSI_QUAD_SIZE elements.  The same storage
 * can be viewed as doubles, as pairs of 'unsigned' words, or as unsigned /
 * signed 64-bit integers.
 */
union tgsi_double_channel {
   double d[TGSI_QUAD_SIZE];        /* double-precision float view */
   unsigned u[TGSI_QUAD_SIZE][2];   /* two 'unsigned' words per element */
   uint64_t u64[TGSI_QUAD_SIZE];    /* unsigned 64-bit integer view */
   int64_t i64[TGSI_QUAD_SIZE];     /* signed 64-bit integer view */
};
83
/**
 * A pair of double channels, named 'xy' and 'zw'.
 * NOTE(review): presumably each 64-bit channel occupies two components of a
 * four-component register -- confirm against the users of this type.
 */
struct tgsi_double_vector {
   union tgsi_double_channel xy;
   union tgsi_double_channel zw;
};
88
89static void
90micro_abs(union tgsi_exec_channel *dst,
91          const union tgsi_exec_channel *src)
92{
93   dst->f[0] = fabsf(src->f[0]);
94   dst->f[1] = fabsf(src->f[1]);
95   dst->f[2] = fabsf(src->f[2]);
96   dst->f[3] = fabsf(src->f[3]);
97}
98
99static void
100micro_arl(union tgsi_exec_channel *dst,
101          const union tgsi_exec_channel *src)
102{
103   dst->i[0] = (int)floorf(src->f[0]);
104   dst->i[1] = (int)floorf(src->f[1]);
105   dst->i[2] = (int)floorf(src->f[2]);
106   dst->i[3] = (int)floorf(src->f[3]);
107}
108
109static void
110micro_arr(union tgsi_exec_channel *dst,
111          const union tgsi_exec_channel *src)
112{
113   dst->i[0] = (int)floorf(src->f[0] + 0.5f);
114   dst->i[1] = (int)floorf(src->f[1] + 0.5f);
115   dst->i[2] = (int)floorf(src->f[2] + 0.5f);
116   dst->i[3] = (int)floorf(src->f[3] + 0.5f);
117}
118
119static void
120micro_ceil(union tgsi_exec_channel *dst,
121           const union tgsi_exec_channel *src)
122{
123   dst->f[0] = ceilf(src->f[0]);
124   dst->f[1] = ceilf(src->f[1]);
125   dst->f[2] = ceilf(src->f[2]);
126   dst->f[3] = ceilf(src->f[3]);
127}
128
129static void
130micro_clamp(union tgsi_exec_channel *dst,
131            const union tgsi_exec_channel *src0,
132            const union tgsi_exec_channel *src1,
133            const union tgsi_exec_channel *src2)
134{
135   dst->f[0] = src0->f[0] < src1->f[0] ? src1->f[0] : src0->f[0] > src2->f[0] ? src2->f[0] : src0->f[0];
136   dst->f[1] = src0->f[1] < src1->f[1] ? src1->f[1] : src0->f[1] > src2->f[1] ? src2->f[1] : src0->f[1];
137   dst->f[2] = src0->f[2] < src1->f[2] ? src1->f[2] : src0->f[2] > src2->f[2] ? src2->f[2] : src0->f[2];
138   dst->f[3] = src0->f[3] < src1->f[3] ? src1->f[3] : src0->f[3] > src2->f[3] ? src2->f[3] : src0->f[3];
139}
140
141static void
142micro_cmp(union tgsi_exec_channel *dst,
143          const union tgsi_exec_channel *src0,
144          const union tgsi_exec_channel *src1,
145          const union tgsi_exec_channel *src2)
146{
147   dst->f[0] = src0->f[0] < 0.0f ? src1->f[0] : src2->f[0];
148   dst->f[1] = src0->f[1] < 0.0f ? src1->f[1] : src2->f[1];
149   dst->f[2] = src0->f[2] < 0.0f ? src1->f[2] : src2->f[2];
150   dst->f[3] = src0->f[3] < 0.0f ? src1->f[3] : src2->f[3];
151}
152
153static void
154micro_cos(union tgsi_exec_channel *dst,
155          const union tgsi_exec_channel *src)
156{
157   dst->f[0] = cosf(src->f[0]);
158   dst->f[1] = cosf(src->f[1]);
159   dst->f[2] = cosf(src->f[2]);
160   dst->f[3] = cosf(src->f[3]);
161}
162
163static void
164micro_d2f(union tgsi_exec_channel *dst,
165          const union tgsi_double_channel *src)
166{
167   dst->f[0] = (float)src->d[0];
168   dst->f[1] = (float)src->d[1];
169   dst->f[2] = (float)src->d[2];
170   dst->f[3] = (float)src->d[3];
171}
172
173static void
174micro_d2i(union tgsi_exec_channel *dst,
175          const union tgsi_double_channel *src)
176{
177   dst->i[0] = (int)src->d[0];
178   dst->i[1] = (int)src->d[1];
179   dst->i[2] = (int)src->d[2];
180   dst->i[3] = (int)src->d[3];
181}
182
183static void
184micro_d2u(union tgsi_exec_channel *dst,
185          const union tgsi_double_channel *src)
186{
187   dst->u[0] = (unsigned)src->d[0];
188   dst->u[1] = (unsigned)src->d[1];
189   dst->u[2] = (unsigned)src->d[2];
190   dst->u[3] = (unsigned)src->d[3];
191}
192static void
193micro_dabs(union tgsi_double_channel *dst,
194           const union tgsi_double_channel *src)
195{
196   dst->d[0] = src->d[0] >= 0.0 ? src->d[0] : -src->d[0];
197   dst->d[1] = src->d[1] >= 0.0 ? src->d[1] : -src->d[1];
198   dst->d[2] = src->d[2] >= 0.0 ? src->d[2] : -src->d[2];
199   dst->d[3] = src->d[3] >= 0.0 ? src->d[3] : -src->d[3];
200}
201
202static void
203micro_dadd(union tgsi_double_channel *dst,
204          const union tgsi_double_channel *src)
205{
206   dst->d[0] = src[0].d[0] + src[1].d[0];
207   dst->d[1] = src[0].d[1] + src[1].d[1];
208   dst->d[2] = src[0].d[2] + src[1].d[2];
209   dst->d[3] = src[0].d[3] + src[1].d[3];
210}
211
212static void
213micro_ddiv(union tgsi_double_channel *dst,
214          const union tgsi_double_channel *src)
215{
216   dst->d[0] = src[0].d[0] / src[1].d[0];
217   dst->d[1] = src[0].d[1] / src[1].d[1];
218   dst->d[2] = src[0].d[2] / src[1].d[2];
219   dst->d[3] = src[0].d[3] / src[1].d[3];
220}
221
222static void
223micro_ddx(union tgsi_exec_channel *dst,
224          const union tgsi_exec_channel *src)
225{
226   dst->f[0] =
227   dst->f[1] =
228   dst->f[2] =
229   dst->f[3] = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_BOTTOM_LEFT];
230}
231
232static void
233micro_ddy(union tgsi_exec_channel *dst,
234          const union tgsi_exec_channel *src)
235{
236   dst->f[0] =
237   dst->f[1] =
238   dst->f[2] =
239   dst->f[3] = src->f[TILE_BOTTOM_LEFT] - src->f[TILE_TOP_LEFT];
240}
241
242static void
243micro_dmul(union tgsi_double_channel *dst,
244           const union tgsi_double_channel *src)
245{
246   dst->d[0] = src[0].d[0] * src[1].d[0];
247   dst->d[1] = src[0].d[1] * src[1].d[1];
248   dst->d[2] = src[0].d[2] * src[1].d[2];
249   dst->d[3] = src[0].d[3] * src[1].d[3];
250}
251
252static void
253micro_dmax(union tgsi_double_channel *dst,
254           const union tgsi_double_channel *src)
255{
256   dst->d[0] = src[0].d[0] > src[1].d[0] ? src[0].d[0] : src[1].d[0];
257   dst->d[1] = src[0].d[1] > src[1].d[1] ? src[0].d[1] : src[1].d[1];
258   dst->d[2] = src[0].d[2] > src[1].d[2] ? src[0].d[2] : src[1].d[2];
259   dst->d[3] = src[0].d[3] > src[1].d[3] ? src[0].d[3] : src[1].d[3];
260}
261
262static void
263micro_dmin(union tgsi_double_channel *dst,
264           const union tgsi_double_channel *src)
265{
266   dst->d[0] = src[0].d[0] < src[1].d[0] ? src[0].d[0] : src[1].d[0];
267   dst->d[1] = src[0].d[1] < src[1].d[1] ? src[0].d[1] : src[1].d[1];
268   dst->d[2] = src[0].d[2] < src[1].d[2] ? src[0].d[2] : src[1].d[2];
269   dst->d[3] = src[0].d[3] < src[1].d[3] ? src[0].d[3] : src[1].d[3];
270}
271
272static void
273micro_dneg(union tgsi_double_channel *dst,
274           const union tgsi_double_channel *src)
275{
276   dst->d[0] = -src->d[0];
277   dst->d[1] = -src->d[1];
278   dst->d[2] = -src->d[2];
279   dst->d[3] = -src->d[3];
280}
281
282static void
283micro_dslt(union tgsi_double_channel *dst,
284           const union tgsi_double_channel *src)
285{
286   dst->u[0][0] = src[0].d[0] < src[1].d[0] ? ~0U : 0U;
287   dst->u[1][0] = src[0].d[1] < src[1].d[1] ? ~0U : 0U;
288   dst->u[2][0] = src[0].d[2] < src[1].d[2] ? ~0U : 0U;
289   dst->u[3][0] = src[0].d[3] < src[1].d[3] ? ~0U : 0U;
290}
291
292static void
293micro_dsne(union tgsi_double_channel *dst,
294           const union tgsi_double_channel *src)
295{
296   dst->u[0][0] = src[0].d[0] != src[1].d[0] ? ~0U : 0U;
297   dst->u[1][0] = src[0].d[1] != src[1].d[1] ? ~0U : 0U;
298   dst->u[2][0] = src[0].d[2] != src[1].d[2] ? ~0U : 0U;
299   dst->u[3][0] = src[0].d[3] != src[1].d[3] ? ~0U : 0U;
300}
301
302static void
303micro_dsge(union tgsi_double_channel *dst,
304           const union tgsi_double_channel *src)
305{
306   dst->u[0][0] = src[0].d[0] >= src[1].d[0] ? ~0U : 0U;
307   dst->u[1][0] = src[0].d[1] >= src[1].d[1] ? ~0U : 0U;
308   dst->u[2][0] = src[0].d[2] >= src[1].d[2] ? ~0U : 0U;
309   dst->u[3][0] = src[0].d[3] >= src[1].d[3] ? ~0U : 0U;
310}
311
312static void
313micro_dseq(union tgsi_double_channel *dst,
314           const union tgsi_double_channel *src)
315{
316   dst->u[0][0] = src[0].d[0] == src[1].d[0] ? ~0U : 0U;
317   dst->u[1][0] = src[0].d[1] == src[1].d[1] ? ~0U : 0U;
318   dst->u[2][0] = src[0].d[2] == src[1].d[2] ? ~0U : 0U;
319   dst->u[3][0] = src[0].d[3] == src[1].d[3] ? ~0U : 0U;
320}
321
322static void
323micro_drcp(union tgsi_double_channel *dst,
324           const union tgsi_double_channel *src)
325{
326   dst->d[0] = 1.0 / src->d[0];
327   dst->d[1] = 1.0 / src->d[1];
328   dst->d[2] = 1.0 / src->d[2];
329   dst->d[3] = 1.0 / src->d[3];
330}
331
332static void
333micro_dsqrt(union tgsi_double_channel *dst,
334            const union tgsi_double_channel *src)
335{
336   dst->d[0] = sqrt(src->d[0]);
337   dst->d[1] = sqrt(src->d[1]);
338   dst->d[2] = sqrt(src->d[2]);
339   dst->d[3] = sqrt(src->d[3]);
340}
341
342static void
343micro_drsq(union tgsi_double_channel *dst,
344          const union tgsi_double_channel *src)
345{
346   dst->d[0] = 1.0 / sqrt(src->d[0]);
347   dst->d[1] = 1.0 / sqrt(src->d[1]);
348   dst->d[2] = 1.0 / sqrt(src->d[2]);
349   dst->d[3] = 1.0 / sqrt(src->d[3]);
350}
351
352static void
353micro_dmad(union tgsi_double_channel *dst,
354           const union tgsi_double_channel *src)
355{
356   dst->d[0] = src[0].d[0] * src[1].d[0] + src[2].d[0];
357   dst->d[1] = src[0].d[1] * src[1].d[1] + src[2].d[1];
358   dst->d[2] = src[0].d[2] * src[1].d[2] + src[2].d[2];
359   dst->d[3] = src[0].d[3] * src[1].d[3] + src[2].d[3];
360}
361
362static void
363micro_dfrac(union tgsi_double_channel *dst,
364            const union tgsi_double_channel *src)
365{
366   dst->d[0] = src->d[0] - floor(src->d[0]);
367   dst->d[1] = src->d[1] - floor(src->d[1]);
368   dst->d[2] = src->d[2] - floor(src->d[2]);
369   dst->d[3] = src->d[3] - floor(src->d[3]);
370}
371
372static void
373micro_dldexp(union tgsi_double_channel *dst,
374             const union tgsi_double_channel *src0,
375             union tgsi_exec_channel *src1)
376{
377   dst->d[0] = ldexp(src0->d[0], src1->i[0]);
378   dst->d[1] = ldexp(src0->d[1], src1->i[1]);
379   dst->d[2] = ldexp(src0->d[2], src1->i[2]);
380   dst->d[3] = ldexp(src0->d[3], src1->i[3]);
381}
382
383static void
384micro_dfracexp(union tgsi_double_channel *dst,
385               union tgsi_exec_channel *dst_exp,
386               const union tgsi_double_channel *src)
387{
388   dst->d[0] = frexp(src->d[0], &dst_exp->i[0]);
389   dst->d[1] = frexp(src->d[1], &dst_exp->i[1]);
390   dst->d[2] = frexp(src->d[2], &dst_exp->i[2]);
391   dst->d[3] = frexp(src->d[3], &dst_exp->i[3]);
392}
393
394static void
395micro_exp2(union tgsi_exec_channel *dst,
396           const union tgsi_exec_channel *src)
397{
398#if FAST_MATH
399   dst->f[0] = util_fast_exp2(src->f[0]);
400   dst->f[1] = util_fast_exp2(src->f[1]);
401   dst->f[2] = util_fast_exp2(src->f[2]);
402   dst->f[3] = util_fast_exp2(src->f[3]);
403#else
404#if DEBUG
405   /* Inf is okay for this instruction, so clamp it to silence assertions. */
406   uint i;
407   union tgsi_exec_channel clamped;
408
409   for (i = 0; i < 4; i++) {
410      if (src->f[i] > 127.99999f) {
411         clamped.f[i] = 127.99999f;
412      } else if (src->f[i] < -126.99999f) {
413         clamped.f[i] = -126.99999f;
414      } else {
415         clamped.f[i] = src->f[i];
416      }
417   }
418   src = &clamped;
419#endif /* DEBUG */
420
421   dst->f[0] = powf(2.0f, src->f[0]);
422   dst->f[1] = powf(2.0f, src->f[1]);
423   dst->f[2] = powf(2.0f, src->f[2]);
424   dst->f[3] = powf(2.0f, src->f[3]);
425#endif /* FAST_MATH */
426}
427
428static void
429micro_f2d(union tgsi_double_channel *dst,
430          const union tgsi_exec_channel *src)
431{
432   dst->d[0] = (double)src->f[0];
433   dst->d[1] = (double)src->f[1];
434   dst->d[2] = (double)src->f[2];
435   dst->d[3] = (double)src->f[3];
436}
437
438static void
439micro_flr(union tgsi_exec_channel *dst,
440          const union tgsi_exec_channel *src)
441{
442   dst->f[0] = floorf(src->f[0]);
443   dst->f[1] = floorf(src->f[1]);
444   dst->f[2] = floorf(src->f[2]);
445   dst->f[3] = floorf(src->f[3]);
446}
447
448static void
449micro_frc(union tgsi_exec_channel *dst,
450          const union tgsi_exec_channel *src)
451{
452   dst->f[0] = src->f[0] - floorf(src->f[0]);
453   dst->f[1] = src->f[1] - floorf(src->f[1]);
454   dst->f[2] = src->f[2] - floorf(src->f[2]);
455   dst->f[3] = src->f[3] - floorf(src->f[3]);
456}
457
458static void
459micro_i2d(union tgsi_double_channel *dst,
460          const union tgsi_exec_channel *src)
461{
462   dst->d[0] = (double)src->i[0];
463   dst->d[1] = (double)src->i[1];
464   dst->d[2] = (double)src->i[2];
465   dst->d[3] = (double)src->i[3];
466}
467
468static void
469micro_iabs(union tgsi_exec_channel *dst,
470           const union tgsi_exec_channel *src)
471{
472   dst->i[0] = src->i[0] >= 0 ? src->i[0] : -src->i[0];
473   dst->i[1] = src->i[1] >= 0 ? src->i[1] : -src->i[1];
474   dst->i[2] = src->i[2] >= 0 ? src->i[2] : -src->i[2];
475   dst->i[3] = src->i[3] >= 0 ? src->i[3] : -src->i[3];
476}
477
478static void
479micro_ineg(union tgsi_exec_channel *dst,
480           const union tgsi_exec_channel *src)
481{
482   dst->i[0] = -src->i[0];
483   dst->i[1] = -src->i[1];
484   dst->i[2] = -src->i[2];
485   dst->i[3] = -src->i[3];
486}
487
488static void
489micro_lg2(union tgsi_exec_channel *dst,
490          const union tgsi_exec_channel *src)
491{
492#if FAST_MATH
493   dst->f[0] = util_fast_log2(src->f[0]);
494   dst->f[1] = util_fast_log2(src->f[1]);
495   dst->f[2] = util_fast_log2(src->f[2]);
496   dst->f[3] = util_fast_log2(src->f[3]);
497#else
498   dst->f[0] = logf(src->f[0]) * 1.442695f;
499   dst->f[1] = logf(src->f[1]) * 1.442695f;
500   dst->f[2] = logf(src->f[2]) * 1.442695f;
501   dst->f[3] = logf(src->f[3]) * 1.442695f;
502#endif
503}
504
505static void
506micro_lrp(union tgsi_exec_channel *dst,
507          const union tgsi_exec_channel *src0,
508          const union tgsi_exec_channel *src1,
509          const union tgsi_exec_channel *src2)
510{
511   dst->f[0] = src0->f[0] * (src1->f[0] - src2->f[0]) + src2->f[0];
512   dst->f[1] = src0->f[1] * (src1->f[1] - src2->f[1]) + src2->f[1];
513   dst->f[2] = src0->f[2] * (src1->f[2] - src2->f[2]) + src2->f[2];
514   dst->f[3] = src0->f[3] * (src1->f[3] - src2->f[3]) + src2->f[3];
515}
516
517static void
518micro_mad(union tgsi_exec_channel *dst,
519          const union tgsi_exec_channel *src0,
520          const union tgsi_exec_channel *src1,
521          const union tgsi_exec_channel *src2)
522{
523   dst->f[0] = src0->f[0] * src1->f[0] + src2->f[0];
524   dst->f[1] = src0->f[1] * src1->f[1] + src2->f[1];
525   dst->f[2] = src0->f[2] * src1->f[2] + src2->f[2];
526   dst->f[3] = src0->f[3] * src1->f[3] + src2->f[3];
527}
528
529static void
530micro_mov(union tgsi_exec_channel *dst,
531          const union tgsi_exec_channel *src)
532{
533   dst->u[0] = src->u[0];
534   dst->u[1] = src->u[1];
535   dst->u[2] = src->u[2];
536   dst->u[3] = src->u[3];
537}
538
539static void
540micro_rcp(union tgsi_exec_channel *dst,
541          const union tgsi_exec_channel *src)
542{
543#if 0 /* for debugging */
544   assert(src->f[0] != 0.0f);
545   assert(src->f[1] != 0.0f);
546   assert(src->f[2] != 0.0f);
547   assert(src->f[3] != 0.0f);
548#endif
549   dst->f[0] = 1.0f / src->f[0];
550   dst->f[1] = 1.0f / src->f[1];
551   dst->f[2] = 1.0f / src->f[2];
552   dst->f[3] = 1.0f / src->f[3];
553}
554
555static void
556micro_rnd(union tgsi_exec_channel *dst,
557          const union tgsi_exec_channel *src)
558{
559   dst->f[0] = _mesa_roundevenf(src->f[0]);
560   dst->f[1] = _mesa_roundevenf(src->f[1]);
561   dst->f[2] = _mesa_roundevenf(src->f[2]);
562   dst->f[3] = _mesa_roundevenf(src->f[3]);
563}
564
565static void
566micro_rsq(union tgsi_exec_channel *dst,
567          const union tgsi_exec_channel *src)
568{
569#if 0 /* for debugging */
570   assert(src->f[0] != 0.0f);
571   assert(src->f[1] != 0.0f);
572   assert(src->f[2] != 0.0f);
573   assert(src->f[3] != 0.0f);
574#endif
575   dst->f[0] = 1.0f / sqrtf(src->f[0]);
576   dst->f[1] = 1.0f / sqrtf(src->f[1]);
577   dst->f[2] = 1.0f / sqrtf(src->f[2]);
578   dst->f[3] = 1.0f / sqrtf(src->f[3]);
579}
580
581static void
582micro_sqrt(union tgsi_exec_channel *dst,
583           const union tgsi_exec_channel *src)
584{
585   dst->f[0] = sqrtf(src->f[0]);
586   dst->f[1] = sqrtf(src->f[1]);
587   dst->f[2] = sqrtf(src->f[2]);
588   dst->f[3] = sqrtf(src->f[3]);
589}
590
591static void
592micro_seq(union tgsi_exec_channel *dst,
593          const union tgsi_exec_channel *src0,
594          const union tgsi_exec_channel *src1)
595{
596   dst->f[0] = src0->f[0] == src1->f[0] ? 1.0f : 0.0f;
597   dst->f[1] = src0->f[1] == src1->f[1] ? 1.0f : 0.0f;
598   dst->f[2] = src0->f[2] == src1->f[2] ? 1.0f : 0.0f;
599   dst->f[3] = src0->f[3] == src1->f[3] ? 1.0f : 0.0f;
600}
601
602static void
603micro_sge(union tgsi_exec_channel *dst,
604          const union tgsi_exec_channel *src0,
605          const union tgsi_exec_channel *src1)
606{
607   dst->f[0] = src0->f[0] >= src1->f[0] ? 1.0f : 0.0f;
608   dst->f[1] = src0->f[1] >= src1->f[1] ? 1.0f : 0.0f;
609   dst->f[2] = src0->f[2] >= src1->f[2] ? 1.0f : 0.0f;
610   dst->f[3] = src0->f[3] >= src1->f[3] ? 1.0f : 0.0f;
611}
612
613static void
614micro_sgn(union tgsi_exec_channel *dst,
615          const union tgsi_exec_channel *src)
616{
617   dst->f[0] = src->f[0] < 0.0f ? -1.0f : src->f[0] > 0.0f ? 1.0f : 0.0f;
618   dst->f[1] = src->f[1] < 0.0f ? -1.0f : src->f[1] > 0.0f ? 1.0f : 0.0f;
619   dst->f[2] = src->f[2] < 0.0f ? -1.0f : src->f[2] > 0.0f ? 1.0f : 0.0f;
620   dst->f[3] = src->f[3] < 0.0f ? -1.0f : src->f[3] > 0.0f ? 1.0f : 0.0f;
621}
622
623static void
624micro_isgn(union tgsi_exec_channel *dst,
625          const union tgsi_exec_channel *src)
626{
627   dst->i[0] = src->i[0] < 0 ? -1 : src->i[0] > 0 ? 1 : 0;
628   dst->i[1] = src->i[1] < 0 ? -1 : src->i[1] > 0 ? 1 : 0;
629   dst->i[2] = src->i[2] < 0 ? -1 : src->i[2] > 0 ? 1 : 0;
630   dst->i[3] = src->i[3] < 0 ? -1 : src->i[3] > 0 ? 1 : 0;
631}
632
633static void
634micro_sgt(union tgsi_exec_channel *dst,
635          const union tgsi_exec_channel *src0,
636          const union tgsi_exec_channel *src1)
637{
638   dst->f[0] = src0->f[0] > src1->f[0] ? 1.0f : 0.0f;
639   dst->f[1] = src0->f[1] > src1->f[1] ? 1.0f : 0.0f;
640   dst->f[2] = src0->f[2] > src1->f[2] ? 1.0f : 0.0f;
641   dst->f[3] = src0->f[3] > src1->f[3] ? 1.0f : 0.0f;
642}
643
644static void
645micro_sin(union tgsi_exec_channel *dst,
646          const union tgsi_exec_channel *src)
647{
648   dst->f[0] = sinf(src->f[0]);
649   dst->f[1] = sinf(src->f[1]);
650   dst->f[2] = sinf(src->f[2]);
651   dst->f[3] = sinf(src->f[3]);
652}
653
654static void
655micro_sle(union tgsi_exec_channel *dst,
656          const union tgsi_exec_channel *src0,
657          const union tgsi_exec_channel *src1)
658{
659   dst->f[0] = src0->f[0] <= src1->f[0] ? 1.0f : 0.0f;
660   dst->f[1] = src0->f[1] <= src1->f[1] ? 1.0f : 0.0f;
661   dst->f[2] = src0->f[2] <= src1->f[2] ? 1.0f : 0.0f;
662   dst->f[3] = src0->f[3] <= src1->f[3] ? 1.0f : 0.0f;
663}
664
665static void
666micro_slt(union tgsi_exec_channel *dst,
667          const union tgsi_exec_channel *src0,
668          const union tgsi_exec_channel *src1)
669{
670   dst->f[0] = src0->f[0] < src1->f[0] ? 1.0f : 0.0f;
671   dst->f[1] = src0->f[1] < src1->f[1] ? 1.0f : 0.0f;
672   dst->f[2] = src0->f[2] < src1->f[2] ? 1.0f : 0.0f;
673   dst->f[3] = src0->f[3] < src1->f[3] ? 1.0f : 0.0f;
674}
675
676static void
677micro_sne(union tgsi_exec_channel *dst,
678          const union tgsi_exec_channel *src0,
679          const union tgsi_exec_channel *src1)
680{
681   dst->f[0] = src0->f[0] != src1->f[0] ? 1.0f : 0.0f;
682   dst->f[1] = src0->f[1] != src1->f[1] ? 1.0f : 0.0f;
683   dst->f[2] = src0->f[2] != src1->f[2] ? 1.0f : 0.0f;
684   dst->f[3] = src0->f[3] != src1->f[3] ? 1.0f : 0.0f;
685}
686
687static void
688micro_trunc(union tgsi_exec_channel *dst,
689            const union tgsi_exec_channel *src)
690{
691   dst->f[0] = truncf(src->f[0]);
692   dst->f[1] = truncf(src->f[1]);
693   dst->f[2] = truncf(src->f[2]);
694   dst->f[3] = truncf(src->f[3]);
695}
696
697static void
698micro_u2d(union tgsi_double_channel *dst,
699          const union tgsi_exec_channel *src)
700{
701   dst->d[0] = (double)src->u[0];
702   dst->d[1] = (double)src->u[1];
703   dst->d[2] = (double)src->u[2];
704   dst->d[3] = (double)src->u[3];
705}
706
707static void
708micro_i64abs(union tgsi_double_channel *dst,
709             const union tgsi_double_channel *src)
710{
711   dst->i64[0] = src->i64[0] >= 0.0 ? src->i64[0] : -src->i64[0];
712   dst->i64[1] = src->i64[1] >= 0.0 ? src->i64[1] : -src->i64[1];
713   dst->i64[2] = src->i64[2] >= 0.0 ? src->i64[2] : -src->i64[2];
714   dst->i64[3] = src->i64[3] >= 0.0 ? src->i64[3] : -src->i64[3];
715}
716
717static void
718micro_i64sgn(union tgsi_double_channel *dst,
719             const union tgsi_double_channel *src)
720{
721   dst->i64[0] = src->i64[0] < 0 ? -1 : src->i64[0] > 0 ? 1 : 0;
722   dst->i64[1] = src->i64[1] < 0 ? -1 : src->i64[1] > 0 ? 1 : 0;
723   dst->i64[2] = src->i64[2] < 0 ? -1 : src->i64[2] > 0 ? 1 : 0;
724   dst->i64[3] = src->i64[3] < 0 ? -1 : src->i64[3] > 0 ? 1 : 0;
725}
726
727static void
728micro_i64neg(union tgsi_double_channel *dst,
729             const union tgsi_double_channel *src)
730{
731   dst->i64[0] = -src->i64[0];
732   dst->i64[1] = -src->i64[1];
733   dst->i64[2] = -src->i64[2];
734   dst->i64[3] = -src->i64[3];
735}
736
737static void
738micro_u64seq(union tgsi_double_channel *dst,
739           const union tgsi_double_channel *src)
740{
741   dst->u[0][0] = src[0].u64[0] == src[1].u64[0] ? ~0U : 0U;
742   dst->u[1][0] = src[0].u64[1] == src[1].u64[1] ? ~0U : 0U;
743   dst->u[2][0] = src[0].u64[2] == src[1].u64[2] ? ~0U : 0U;
744   dst->u[3][0] = src[0].u64[3] == src[1].u64[3] ? ~0U : 0U;
745}
746
747static void
748micro_u64sne(union tgsi_double_channel *dst,
749             const union tgsi_double_channel *src)
750{
751   dst->u[0][0] = src[0].u64[0] != src[1].u64[0] ? ~0U : 0U;
752   dst->u[1][0] = src[0].u64[1] != src[1].u64[1] ? ~0U : 0U;
753   dst->u[2][0] = src[0].u64[2] != src[1].u64[2] ? ~0U : 0U;
754   dst->u[3][0] = src[0].u64[3] != src[1].u64[3] ? ~0U : 0U;
755}
756
757static void
758micro_i64slt(union tgsi_double_channel *dst,
759             const union tgsi_double_channel *src)
760{
761   dst->u[0][0] = src[0].i64[0] < src[1].i64[0] ? ~0U : 0U;
762   dst->u[1][0] = src[0].i64[1] < src[1].i64[1] ? ~0U : 0U;
763   dst->u[2][0] = src[0].i64[2] < src[1].i64[2] ? ~0U : 0U;
764   dst->u[3][0] = src[0].i64[3] < src[1].i64[3] ? ~0U : 0U;
765}
766
767static void
768micro_u64slt(union tgsi_double_channel *dst,
769             const union tgsi_double_channel *src)
770{
771   dst->u[0][0] = src[0].u64[0] < src[1].u64[0] ? ~0U : 0U;
772   dst->u[1][0] = src[0].u64[1] < src[1].u64[1] ? ~0U : 0U;
773   dst->u[2][0] = src[0].u64[2] < src[1].u64[2] ? ~0U : 0U;
774   dst->u[3][0] = src[0].u64[3] < src[1].u64[3] ? ~0U : 0U;
775}
776
777static void
778micro_i64sge(union tgsi_double_channel *dst,
779           const union tgsi_double_channel *src)
780{
781   dst->u[0][0] = src[0].i64[0] >= src[1].i64[0] ? ~0U : 0U;
782   dst->u[1][0] = src[0].i64[1] >= src[1].i64[1] ? ~0U : 0U;
783   dst->u[2][0] = src[0].i64[2] >= src[1].i64[2] ? ~0U : 0U;
784   dst->u[3][0] = src[0].i64[3] >= src[1].i64[3] ? ~0U : 0U;
785}
786
787static void
788micro_u64sge(union tgsi_double_channel *dst,
789             const union tgsi_double_channel *src)
790{
791   dst->u[0][0] = src[0].u64[0] >= src[1].u64[0] ? ~0U : 0U;
792   dst->u[1][0] = src[0].u64[1] >= src[1].u64[1] ? ~0U : 0U;
793   dst->u[2][0] = src[0].u64[2] >= src[1].u64[2] ? ~0U : 0U;
794   dst->u[3][0] = src[0].u64[3] >= src[1].u64[3] ? ~0U : 0U;
795}
796
797static void
798micro_u64max(union tgsi_double_channel *dst,
799             const union tgsi_double_channel *src)
800{
801   dst->u64[0] = src[0].u64[0] > src[1].u64[0] ? src[0].u64[0] : src[1].u64[0];
802   dst->u64[1] = src[0].u64[1] > src[1].u64[1] ? src[0].u64[1] : src[1].u64[1];
803   dst->u64[2] = src[0].u64[2] > src[1].u64[2] ? src[0].u64[2] : src[1].u64[2];
804   dst->u64[3] = src[0].u64[3] > src[1].u64[3] ? src[0].u64[3] : src[1].u64[3];
805}
806
807static void
808micro_i64max(union tgsi_double_channel *dst,
809             const union tgsi_double_channel *src)
810{
811   dst->i64[0] = src[0].i64[0] > src[1].i64[0] ? src[0].i64[0] : src[1].i64[0];
812   dst->i64[1] = src[0].i64[1] > src[1].i64[1] ? src[0].i64[1] : src[1].i64[1];
813   dst->i64[2] = src[0].i64[2] > src[1].i64[2] ? src[0].i64[2] : src[1].i64[2];
814   dst->i64[3] = src[0].i64[3] > src[1].i64[3] ? src[0].i64[3] : src[1].i64[3];
815}
816
817static void
818micro_u64min(union tgsi_double_channel *dst,
819             const union tgsi_double_channel *src)
820{
821   dst->u64[0] = src[0].u64[0] < src[1].u64[0] ? src[0].u64[0] : src[1].u64[0];
822   dst->u64[1] = src[0].u64[1] < src[1].u64[1] ? src[0].u64[1] : src[1].u64[1];
823   dst->u64[2] = src[0].u64[2] < src[1].u64[2] ? src[0].u64[2] : src[1].u64[2];
824   dst->u64[3] = src[0].u64[3] < src[1].u64[3] ? src[0].u64[3] : src[1].u64[3];
825}
826
827static void
828micro_i64min(union tgsi_double_channel *dst,
829             const union tgsi_double_channel *src)
830{
831   dst->i64[0] = src[0].i64[0] < src[1].i64[0] ? src[0].i64[0] : src[1].i64[0];
832   dst->i64[1] = src[0].i64[1] < src[1].i64[1] ? src[0].i64[1] : src[1].i64[1];
833   dst->i64[2] = src[0].i64[2] < src[1].i64[2] ? src[0].i64[2] : src[1].i64[2];
834   dst->i64[3] = src[0].i64[3] < src[1].i64[3] ? src[0].i64[3] : src[1].i64[3];
835}
836
837static void
838micro_u64add(union tgsi_double_channel *dst,
839             const union tgsi_double_channel *src)
840{
841   dst->u64[0] = src[0].u64[0] + src[1].u64[0];
842   dst->u64[1] = src[0].u64[1] + src[1].u64[1];
843   dst->u64[2] = src[0].u64[2] + src[1].u64[2];
844   dst->u64[3] = src[0].u64[3] + src[1].u64[3];
845}
846
847static void
848micro_u64mul(union tgsi_double_channel *dst,
849             const union tgsi_double_channel *src)
850{
851   dst->u64[0] = src[0].u64[0] * src[1].u64[0];
852   dst->u64[1] = src[0].u64[1] * src[1].u64[1];
853   dst->u64[2] = src[0].u64[2] * src[1].u64[2];
854   dst->u64[3] = src[0].u64[3] * src[1].u64[3];
855}
856
857static void
858micro_u64div(union tgsi_double_channel *dst,
859             const union tgsi_double_channel *src)
860{
861   dst->u64[0] = src[0].u64[0] / src[1].u64[0];
862   dst->u64[1] = src[0].u64[1] / src[1].u64[1];
863   dst->u64[2] = src[0].u64[2] / src[1].u64[2];
864   dst->u64[3] = src[0].u64[3] / src[1].u64[3];
865}
866
867static void
868micro_i64div(union tgsi_double_channel *dst,
869             const union tgsi_double_channel *src)
870{
871   dst->i64[0] = src[0].i64[0] / src[1].i64[0];
872   dst->i64[1] = src[0].i64[1] / src[1].i64[1];
873   dst->i64[2] = src[0].i64[2] / src[1].i64[2];
874   dst->i64[3] = src[0].i64[3] / src[1].i64[3];
875}
876
877static void
878micro_u64mod(union tgsi_double_channel *dst,
879             const union tgsi_double_channel *src)
880{
881   dst->u64[0] = src[0].u64[0] % src[1].u64[0];
882   dst->u64[1] = src[0].u64[1] % src[1].u64[1];
883   dst->u64[2] = src[0].u64[2] % src[1].u64[2];
884   dst->u64[3] = src[0].u64[3] % src[1].u64[3];
885}
886
887static void
888micro_i64mod(union tgsi_double_channel *dst,
889             const union tgsi_double_channel *src)
890{
891   dst->i64[0] = src[0].i64[0] % src[1].i64[0];
892   dst->i64[1] = src[0].i64[1] % src[1].i64[1];
893   dst->i64[2] = src[0].i64[2] % src[1].i64[2];
894   dst->i64[3] = src[0].i64[3] % src[1].i64[3];
895}
896
897static void
898micro_u64shl(union tgsi_double_channel *dst,
899             const union tgsi_double_channel *src0,
900             union tgsi_exec_channel *src1)
901{
902   unsigned masked_count;
903   masked_count = src1->u[0] & 0x3f;
904   dst->u64[0] = src0->u64[0] << masked_count;
905   masked_count = src1->u[1] & 0x3f;
906   dst->u64[1] = src0->u64[1] << masked_count;
907   masked_count = src1->u[2] & 0x3f;
908   dst->u64[2] = src0->u64[2] << masked_count;
909   masked_count = src1->u[3] & 0x3f;
910   dst->u64[3] = src0->u64[3] << masked_count;
911}
912
913static void
914micro_i64shr(union tgsi_double_channel *dst,
915             const union tgsi_double_channel *src0,
916             union tgsi_exec_channel *src1)
917{
918   unsigned masked_count;
919   masked_count = src1->u[0] & 0x3f;
920   dst->i64[0] = src0->i64[0] >> masked_count;
921   masked_count = src1->u[1] & 0x3f;
922   dst->i64[1] = src0->i64[1] >> masked_count;
923   masked_count = src1->u[2] & 0x3f;
924   dst->i64[2] = src0->i64[2] >> masked_count;
925   masked_count = src1->u[3] & 0x3f;
926   dst->i64[3] = src0->i64[3] >> masked_count;
927}
928
929static void
930micro_u64shr(union tgsi_double_channel *dst,
931             const union tgsi_double_channel *src0,
932             union tgsi_exec_channel *src1)
933{
934   unsigned masked_count;
935   masked_count = src1->u[0] & 0x3f;
936   dst->u64[0] = src0->u64[0] >> masked_count;
937   masked_count = src1->u[1] & 0x3f;
938   dst->u64[1] = src0->u64[1] >> masked_count;
939   masked_count = src1->u[2] & 0x3f;
940   dst->u64[2] = src0->u64[2] >> masked_count;
941   masked_count = src1->u[3] & 0x3f;
942   dst->u64[3] = src0->u64[3] >> masked_count;
943}
944
/**
 * Tags naming the data types handled by the interpreter.
 * NOTE(review): presumably used to select how operand bits are interpreted
 * when fetching and storing -- the users are outside this section; confirm.
 */
enum tgsi_exec_datatype {
   TGSI_EXEC_DATA_FLOAT,    /* 32-bit float */
   TGSI_EXEC_DATA_INT,      /* signed integer */
   TGSI_EXEC_DATA_UINT,     /* unsigned integer */
   TGSI_EXEC_DATA_DOUBLE,   /* double-precision float */
   TGSI_EXEC_DATA_INT64,    /* signed 64-bit integer */
   TGSI_EXEC_DATA_UINT64,   /* unsigned 64-bit integer */
};
953
954/*
955 * Shorthand locations of various utility registers (_I = Index, _C = Channel)
956 */
957#define TEMP_KILMASK_I     TGSI_EXEC_TEMP_KILMASK_I
958#define TEMP_KILMASK_C     TGSI_EXEC_TEMP_KILMASK_C
959#define TEMP_OUTPUT_I      TGSI_EXEC_TEMP_OUTPUT_I
960#define TEMP_OUTPUT_C      TGSI_EXEC_TEMP_OUTPUT_C
961#define TEMP_PRIMITIVE_I   TGSI_EXEC_TEMP_PRIMITIVE_I
962#define TEMP_PRIMITIVE_C   TGSI_EXEC_TEMP_PRIMITIVE_C
963
964
/**
 * Recompute the execution mask.
 *
 * ExecMask is the bitwise AND of five masks: the conditional mask
 * (CondMask), the loop mask (LoopMask), the continue mask (ContMask),
 * the switch mask (Switch.mask) and the function mask (FuncMask).
 *
 * MACH is a pointer expression to the machine.  It is fully parenthesized
 * so that arguments such as '&machine' expand correctly, but it is
 * evaluated several times, so it must not have side effects.
 */
#define UPDATE_EXEC_MASK(MACH) \
      ((MACH)->ExecMask = (MACH)->CondMask & (MACH)->LoopMask & \
                          (MACH)->ContMask & (MACH)->Switch.mask & \
                          (MACH)->FuncMask)
968
969
/*
 * Broadcast constants: all four lanes hold the same value.  The brace
 * initializers fill the first member of union tgsi_exec_channel
 * (NOTE(review): presumably the float array 'f' - confirm against the
 * union declaration).
 *
 * ZeroVec is also passed as the 'index2D' argument of
 * fetch_src_file_channel() when an address/indirection register is
 * fetched, i.e. it is read there through the integer view as index 0.
 */
static const union tgsi_exec_channel ZeroVec =
   { { 0.0, 0.0, 0.0, 0.0 } };

/* 1.0 in every lane. */
static const union tgsi_exec_channel OneVec = {
   {1.0f, 1.0f, 1.0f, 1.0f}
};

/* +128.0 in every lane. */
static const union tgsi_exec_channel P128Vec = {
   {128.0f, 128.0f, 128.0f, 128.0f}
};

/* -128.0 in every lane. */
static const union tgsi_exec_channel M128Vec = {
   {-128.0f, -128.0f, -128.0f, -128.0f}
};
984
985
986/**
987 * Assert that none of the float values in 'chan' are infinite or NaN.
988 * NaN and Inf may occur normally during program execution and should
989 * not lead to crashes, etc.  But when debugging, it's helpful to catch
990 * them.
991 */
992static inline void
993check_inf_or_nan(const union tgsi_exec_channel *chan)
994{
995   assert(!util_is_inf_or_nan((chan)->f[0]));
996   assert(!util_is_inf_or_nan((chan)->f[1]));
997   assert(!util_is_inf_or_nan((chan)->f[2]));
998   assert(!util_is_inf_or_nan((chan)->f[3]));
999}
1000
1001
#ifdef DEBUG
/**
 * Debug helper: print the four float lanes of 'chan' on one line,
 * prefixed by the caller-supplied label 'msg'.
 */
static void
print_chan(const char *msg, const union tgsi_exec_channel *chan)
{
   debug_printf("%s = {%f, %f, %f, %f}\n",
                msg,
                chan->f[0],
                chan->f[1],
                chan->f[2],
                chan->f[3]);
}
#endif
1010
1011
#ifdef DEBUG
/**
 * Debug helper: dump temporary register mach->Temps[index].
 *
 * Prints a header line followed by one line per component (X, Y, Z, W),
 * each showing the four float lanes of that component.
 */
static void
print_temp(const struct tgsi_exec_machine *mach, uint index)
{
   const struct tgsi_exec_vector *vec = &mach->Temps[index];
   int comp = 0;

   debug_printf("Temp[%u] =\n", index);

   while (comp < 4) {
      const union tgsi_exec_channel *lanes = &vec->xyzw[comp];

      debug_printf("  %c: { %f, %f, %f, %f }\n",
                   "XYZW"[comp],
                   lanes->f[0], lanes->f[1], lanes->f[2], lanes->f[3]);
      comp++;
   }
}
#endif
1029
1030
1031void
1032tgsi_exec_set_constant_buffers(struct tgsi_exec_machine *mach,
1033                               unsigned num_bufs,
1034                               const void **bufs,
1035                               const unsigned *buf_sizes)
1036{
1037   unsigned i;
1038
1039   for (i = 0; i < num_bufs; i++) {
1040      mach->Consts[i] = bufs[i];
1041      mach->ConstsSize[i] = buf_sizes[i];
1042   }
1043}
1044
1045
1046/**
1047 * Check if there's a potential src/dst register data dependency when
1048 * using SOA execution.
1049 * Example:
1050 *   MOV T, T.yxwz;
1051 * This would expand into:
1052 *   MOV t0, t1;
1053 *   MOV t1, t0;
1054 *   MOV t2, t3;
1055 *   MOV t3, t2;
1056 * The second instruction will have the wrong value for t0 if executed as-is.
1057 */
1058boolean
1059tgsi_check_soa_dependencies(const struct tgsi_full_instruction *inst)
1060{
1061   uint i, chan;
1062
1063   uint writemask = inst->Dst[0].Register.WriteMask;
1064   if (writemask == TGSI_WRITEMASK_X ||
1065       writemask == TGSI_WRITEMASK_Y ||
1066       writemask == TGSI_WRITEMASK_Z ||
1067       writemask == TGSI_WRITEMASK_W ||
1068       writemask == TGSI_WRITEMASK_NONE) {
1069      /* no chance of data dependency */
1070      return FALSE;
1071   }
1072
1073   /* loop over src regs */
1074   for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1075      if ((inst->Src[i].Register.File ==
1076           inst->Dst[0].Register.File) &&
1077          ((inst->Src[i].Register.Index ==
1078            inst->Dst[0].Register.Index) ||
1079           inst->Src[i].Register.Indirect ||
1080           inst->Dst[0].Register.Indirect)) {
1081         /* loop over dest channels */
1082         uint channelsWritten = 0x0;
1083         for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
1084            if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1085               /* check if we're reading a channel that's been written */
1086               uint swizzle = tgsi_util_get_full_src_register_swizzle(&inst->Src[i], chan);
1087               if (channelsWritten & (1 << swizzle)) {
1088                  return TRUE;
1089               }
1090
1091               channelsWritten |= (1 << chan);
1092            }
1093         }
1094      }
1095   }
1096   return FALSE;
1097}
1098
1099
1100/**
1101 * Initialize machine state by expanding tokens to full instructions,
1102 * allocating temporary storage, setting up constants, etc.
1103 * After this, we can call tgsi_exec_machine_run() many times.
1104 */
1105void
1106tgsi_exec_machine_bind_shader(
1107   struct tgsi_exec_machine *mach,
1108   const struct tgsi_token *tokens,
1109   struct tgsi_sampler *sampler,
1110   struct tgsi_image *image,
1111   struct tgsi_buffer *buffer)
1112{
1113   uint k;
1114   struct tgsi_parse_context parse;
1115   struct tgsi_full_instruction *instructions;
1116   struct tgsi_full_declaration *declarations;
1117   uint maxInstructions = 10, numInstructions = 0;
1118   uint maxDeclarations = 10, numDeclarations = 0;
1119
1120#if 0
1121   tgsi_dump(tokens, 0);
1122#endif
1123
1124   util_init_math();
1125
1126
1127   mach->Tokens = tokens;
1128   mach->Sampler = sampler;
1129   mach->Image = image;
1130   mach->Buffer = buffer;
1131
1132   if (!tokens) {
1133      /* unbind and free all */
1134      FREE(mach->Declarations);
1135      mach->Declarations = NULL;
1136      mach->NumDeclarations = 0;
1137
1138      FREE(mach->Instructions);
1139      mach->Instructions = NULL;
1140      mach->NumInstructions = 0;
1141
1142      return;
1143   }
1144
1145   k = tgsi_parse_init (&parse, mach->Tokens);
1146   if (k != TGSI_PARSE_OK) {
1147      debug_printf( "Problem parsing!\n" );
1148      return;
1149   }
1150
1151   mach->ImmLimit = 0;
1152   mach->NumOutputs = 0;
1153
1154   for (k = 0; k < TGSI_SEMANTIC_COUNT; k++)
1155      mach->SysSemanticToIndex[k] = -1;
1156
1157   if (mach->ShaderType == PIPE_SHADER_GEOMETRY &&
1158       !mach->UsedGeometryShader) {
1159      struct tgsi_exec_vector *inputs;
1160      struct tgsi_exec_vector *outputs;
1161
1162      inputs = align_malloc(sizeof(struct tgsi_exec_vector) *
1163                            TGSI_MAX_PRIM_VERTICES * PIPE_MAX_SHADER_INPUTS,
1164                            16);
1165
1166      if (!inputs)
1167         return;
1168
1169      outputs = align_malloc(sizeof(struct tgsi_exec_vector) *
1170                             TGSI_MAX_TOTAL_VERTICES, 16);
1171
1172      if (!outputs) {
1173         align_free(inputs);
1174         return;
1175      }
1176
1177      align_free(mach->Inputs);
1178      align_free(mach->Outputs);
1179
1180      mach->Inputs = inputs;
1181      mach->Outputs = outputs;
1182      mach->UsedGeometryShader = TRUE;
1183   }
1184
1185   declarations = (struct tgsi_full_declaration *)
1186      MALLOC( maxDeclarations * sizeof(struct tgsi_full_declaration) );
1187
1188   if (!declarations) {
1189      return;
1190   }
1191
1192   instructions = (struct tgsi_full_instruction *)
1193      MALLOC( maxInstructions * sizeof(struct tgsi_full_instruction) );
1194
1195   if (!instructions) {
1196      FREE( declarations );
1197      return;
1198   }
1199
1200   while( !tgsi_parse_end_of_tokens( &parse ) ) {
1201      uint i;
1202
1203      tgsi_parse_token( &parse );
1204      switch( parse.FullToken.Token.Type ) {
1205      case TGSI_TOKEN_TYPE_DECLARATION:
1206         /* save expanded declaration */
1207         if (numDeclarations == maxDeclarations) {
1208            declarations = REALLOC(declarations,
1209                                   maxDeclarations
1210                                   * sizeof(struct tgsi_full_declaration),
1211                                   (maxDeclarations + 10)
1212                                   * sizeof(struct tgsi_full_declaration));
1213            maxDeclarations += 10;
1214         }
1215         if (parse.FullToken.FullDeclaration.Declaration.File == TGSI_FILE_OUTPUT) {
1216            unsigned reg;
1217            for (reg = parse.FullToken.FullDeclaration.Range.First;
1218                 reg <= parse.FullToken.FullDeclaration.Range.Last;
1219                 ++reg) {
1220               ++mach->NumOutputs;
1221            }
1222         }
1223         else if (parse.FullToken.FullDeclaration.Declaration.File == TGSI_FILE_SYSTEM_VALUE) {
1224            const struct tgsi_full_declaration *decl = &parse.FullToken.FullDeclaration;
1225            mach->SysSemanticToIndex[decl->Semantic.Name] = decl->Range.First;
1226         }
1227
1228         memcpy(declarations + numDeclarations,
1229                &parse.FullToken.FullDeclaration,
1230                sizeof(declarations[0]));
1231         numDeclarations++;
1232         break;
1233
1234      case TGSI_TOKEN_TYPE_IMMEDIATE:
1235         {
1236            uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
1237            assert( size <= 4 );
1238            assert( mach->ImmLimit + 1 <= TGSI_EXEC_NUM_IMMEDIATES );
1239
1240            for( i = 0; i < size; i++ ) {
1241               mach->Imms[mach->ImmLimit][i] =
1242		  parse.FullToken.FullImmediate.u[i].Float;
1243            }
1244            mach->ImmLimit += 1;
1245         }
1246         break;
1247
1248      case TGSI_TOKEN_TYPE_INSTRUCTION:
1249
1250         /* save expanded instruction */
1251         if (numInstructions == maxInstructions) {
1252            instructions = REALLOC(instructions,
1253                                   maxInstructions
1254                                   * sizeof(struct tgsi_full_instruction),
1255                                   (maxInstructions + 10)
1256                                   * sizeof(struct tgsi_full_instruction));
1257            maxInstructions += 10;
1258         }
1259
1260         memcpy(instructions + numInstructions,
1261                &parse.FullToken.FullInstruction,
1262                sizeof(instructions[0]));
1263
1264         numInstructions++;
1265         break;
1266
1267      case TGSI_TOKEN_TYPE_PROPERTY:
1268         if (mach->ShaderType == PIPE_SHADER_GEOMETRY) {
1269            if (parse.FullToken.FullProperty.Property.PropertyName == TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES) {
1270               mach->MaxOutputVertices = parse.FullToken.FullProperty.u[0].Data;
1271            }
1272         }
1273         break;
1274
1275      default:
1276         assert( 0 );
1277      }
1278   }
1279   tgsi_parse_free (&parse);
1280
1281   FREE(mach->Declarations);
1282   mach->Declarations = declarations;
1283   mach->NumDeclarations = numDeclarations;
1284
1285   FREE(mach->Instructions);
1286   mach->Instructions = instructions;
1287   mach->NumInstructions = numInstructions;
1288}
1289
1290
1291struct tgsi_exec_machine *
1292tgsi_exec_machine_create(enum pipe_shader_type shader_type)
1293{
1294   struct tgsi_exec_machine *mach;
1295   uint i;
1296
1297   mach = align_malloc( sizeof *mach, 16 );
1298   if (!mach)
1299      goto fail;
1300
1301   memset(mach, 0, sizeof(*mach));
1302
1303   mach->ShaderType = shader_type;
1304   mach->Addrs = &mach->Temps[TGSI_EXEC_TEMP_ADDR];
1305   mach->MaxGeometryShaderOutputs = TGSI_MAX_TOTAL_VERTICES;
1306   mach->Predicates = &mach->Temps[TGSI_EXEC_TEMP_P0];
1307
1308   if (shader_type != PIPE_SHADER_COMPUTE) {
1309      mach->Inputs = align_malloc(sizeof(struct tgsi_exec_vector) * PIPE_MAX_SHADER_INPUTS, 16);
1310      mach->Outputs = align_malloc(sizeof(struct tgsi_exec_vector) * PIPE_MAX_SHADER_OUTPUTS, 16);
1311      if (!mach->Inputs || !mach->Outputs)
1312         goto fail;
1313   }
1314
1315   /* Setup constants needed by the SSE2 executor. */
1316   for( i = 0; i < 4; i++ ) {
1317      mach->Temps[TGSI_EXEC_TEMP_00000000_I].xyzw[TGSI_EXEC_TEMP_00000000_C].u[i] = 0x00000000;
1318      mach->Temps[TGSI_EXEC_TEMP_7FFFFFFF_I].xyzw[TGSI_EXEC_TEMP_7FFFFFFF_C].u[i] = 0x7FFFFFFF;
1319      mach->Temps[TGSI_EXEC_TEMP_80000000_I].xyzw[TGSI_EXEC_TEMP_80000000_C].u[i] = 0x80000000;
1320      mach->Temps[TGSI_EXEC_TEMP_FFFFFFFF_I].xyzw[TGSI_EXEC_TEMP_FFFFFFFF_C].u[i] = 0xFFFFFFFF;    /* not used */
1321      mach->Temps[TGSI_EXEC_TEMP_ONE_I].xyzw[TGSI_EXEC_TEMP_ONE_C].f[i] = 1.0f;
1322      mach->Temps[TGSI_EXEC_TEMP_TWO_I].xyzw[TGSI_EXEC_TEMP_TWO_C].f[i] = 2.0f;    /* not used */
1323      mach->Temps[TGSI_EXEC_TEMP_128_I].xyzw[TGSI_EXEC_TEMP_128_C].f[i] = 128.0f;
1324      mach->Temps[TGSI_EXEC_TEMP_MINUS_128_I].xyzw[TGSI_EXEC_TEMP_MINUS_128_C].f[i] = -128.0f;
1325      mach->Temps[TGSI_EXEC_TEMP_THREE_I].xyzw[TGSI_EXEC_TEMP_THREE_C].f[i] = 3.0f;
1326      mach->Temps[TGSI_EXEC_TEMP_HALF_I].xyzw[TGSI_EXEC_TEMP_HALF_C].f[i] = 0.5f;
1327   }
1328
1329#ifdef DEBUG
1330   /* silence warnings */
1331   (void) print_chan;
1332   (void) print_temp;
1333#endif
1334
1335   return mach;
1336
1337fail:
1338   if (mach) {
1339      align_free(mach->Inputs);
1340      align_free(mach->Outputs);
1341      align_free(mach);
1342   }
1343   return NULL;
1344}
1345
1346
1347void
1348tgsi_exec_machine_destroy(struct tgsi_exec_machine *mach)
1349{
1350   if (mach) {
1351      FREE(mach->Instructions);
1352      FREE(mach->Declarations);
1353
1354      align_free(mach->Inputs);
1355      align_free(mach->Outputs);
1356
1357      align_free(mach);
1358   }
1359}
1360
1361static void
1362micro_add(union tgsi_exec_channel *dst,
1363          const union tgsi_exec_channel *src0,
1364          const union tgsi_exec_channel *src1)
1365{
1366   dst->f[0] = src0->f[0] + src1->f[0];
1367   dst->f[1] = src0->f[1] + src1->f[1];
1368   dst->f[2] = src0->f[2] + src1->f[2];
1369   dst->f[3] = src0->f[3] + src1->f[3];
1370}
1371
1372static void
1373micro_div(
1374   union tgsi_exec_channel *dst,
1375   const union tgsi_exec_channel *src0,
1376   const union tgsi_exec_channel *src1 )
1377{
1378   if (src1->f[0] != 0) {
1379      dst->f[0] = src0->f[0] / src1->f[0];
1380   }
1381   if (src1->f[1] != 0) {
1382      dst->f[1] = src0->f[1] / src1->f[1];
1383   }
1384   if (src1->f[2] != 0) {
1385      dst->f[2] = src0->f[2] / src1->f[2];
1386   }
1387   if (src1->f[3] != 0) {
1388      dst->f[3] = src0->f[3] / src1->f[3];
1389   }
1390}
1391
1392static void
1393micro_lt(
1394   union tgsi_exec_channel *dst,
1395   const union tgsi_exec_channel *src0,
1396   const union tgsi_exec_channel *src1,
1397   const union tgsi_exec_channel *src2,
1398   const union tgsi_exec_channel *src3 )
1399{
1400   dst->f[0] = src0->f[0] < src1->f[0] ? src2->f[0] : src3->f[0];
1401   dst->f[1] = src0->f[1] < src1->f[1] ? src2->f[1] : src3->f[1];
1402   dst->f[2] = src0->f[2] < src1->f[2] ? src2->f[2] : src3->f[2];
1403   dst->f[3] = src0->f[3] < src1->f[3] ? src2->f[3] : src3->f[3];
1404}
1405
1406static void
1407micro_max(union tgsi_exec_channel *dst,
1408          const union tgsi_exec_channel *src0,
1409          const union tgsi_exec_channel *src1)
1410{
1411   dst->f[0] = src0->f[0] > src1->f[0] ? src0->f[0] : src1->f[0];
1412   dst->f[1] = src0->f[1] > src1->f[1] ? src0->f[1] : src1->f[1];
1413   dst->f[2] = src0->f[2] > src1->f[2] ? src0->f[2] : src1->f[2];
1414   dst->f[3] = src0->f[3] > src1->f[3] ? src0->f[3] : src1->f[3];
1415}
1416
1417static void
1418micro_min(union tgsi_exec_channel *dst,
1419          const union tgsi_exec_channel *src0,
1420          const union tgsi_exec_channel *src1)
1421{
1422   dst->f[0] = src0->f[0] < src1->f[0] ? src0->f[0] : src1->f[0];
1423   dst->f[1] = src0->f[1] < src1->f[1] ? src0->f[1] : src1->f[1];
1424   dst->f[2] = src0->f[2] < src1->f[2] ? src0->f[2] : src1->f[2];
1425   dst->f[3] = src0->f[3] < src1->f[3] ? src0->f[3] : src1->f[3];
1426}
1427
1428static void
1429micro_mul(union tgsi_exec_channel *dst,
1430          const union tgsi_exec_channel *src0,
1431          const union tgsi_exec_channel *src1)
1432{
1433   dst->f[0] = src0->f[0] * src1->f[0];
1434   dst->f[1] = src0->f[1] * src1->f[1];
1435   dst->f[2] = src0->f[2] * src1->f[2];
1436   dst->f[3] = src0->f[3] * src1->f[3];
1437}
1438
1439static void
1440micro_neg(
1441   union tgsi_exec_channel *dst,
1442   const union tgsi_exec_channel *src )
1443{
1444   dst->f[0] = -src->f[0];
1445   dst->f[1] = -src->f[1];
1446   dst->f[2] = -src->f[2];
1447   dst->f[3] = -src->f[3];
1448}
1449
1450static void
1451micro_pow(
1452   union tgsi_exec_channel *dst,
1453   const union tgsi_exec_channel *src0,
1454   const union tgsi_exec_channel *src1 )
1455{
1456#if FAST_MATH
1457   dst->f[0] = util_fast_pow( src0->f[0], src1->f[0] );
1458   dst->f[1] = util_fast_pow( src0->f[1], src1->f[1] );
1459   dst->f[2] = util_fast_pow( src0->f[2], src1->f[2] );
1460   dst->f[3] = util_fast_pow( src0->f[3], src1->f[3] );
1461#else
1462   dst->f[0] = powf( src0->f[0], src1->f[0] );
1463   dst->f[1] = powf( src0->f[1], src1->f[1] );
1464   dst->f[2] = powf( src0->f[2], src1->f[2] );
1465   dst->f[3] = powf( src0->f[3], src1->f[3] );
1466#endif
1467}
1468
1469static void
1470micro_sub(union tgsi_exec_channel *dst,
1471          const union tgsi_exec_channel *src0,
1472          const union tgsi_exec_channel *src1)
1473{
1474   dst->f[0] = src0->f[0] - src1->f[0];
1475   dst->f[1] = src0->f[1] - src1->f[1];
1476   dst->f[2] = src0->f[2] - src1->f[2];
1477   dst->f[3] = src0->f[3] - src1->f[3];
1478}
1479
/**
 * Fetch one channel (four lanes) from a register file.
 *
 * For every lane i the register is selected by index->i[i] (and, for
 * files that use it, index2D->i[i]); the component is selected by
 * 'swizzle', and lane i of that component is copied into chan[i].
 *
 * \param mach        machine holding the register files
 * \param chan_index  not referenced in this function
 * \param file        TGSI_FILE_x register file to read from
 * \param swizzle     component to read, 0..3
 * \param index       per-lane register index
 * \param index2D     per-lane second-dimension index (constant buffer
 *                    number for CONSTANT, vertex/array slot for INPUT;
 *                    asserted to be zero for the other files that check it)
 * \param chan        result
 *
 * An unknown file asserts and, if the assert is compiled out, yields zero.
 */
static void
fetch_src_file_channel(const struct tgsi_exec_machine *mach,
                       const uint chan_index,
                       const uint file,
                       const uint swizzle,
                       const union tgsi_exec_channel *index,
                       const union tgsi_exec_channel *index2D,
                       union tgsi_exec_channel *chan)
{
   uint i;

   assert(swizzle < 4);

   switch (file) {
   case TGSI_FILE_CONSTANT:
      /* index2D picks the constant buffer, index the vec4 inside it */
      for (i = 0; i < TGSI_QUAD_SIZE; i++) {
         assert(index2D->i[i] >= 0 && index2D->i[i] < PIPE_MAX_CONSTANT_BUFFERS);
         assert(mach->Consts[index2D->i[i]]);

         if (index->i[i] < 0) {
            /* negative register index reads as zero */
            chan->u[i] = 0;
         } else {
            /* NOTE: copying the const value as a uint instead of float */
            const uint constbuf = index2D->i[i];
            const uint *buf = (const uint *)mach->Consts[constbuf];
            /* position in 32-bit elements: four per register */
            const int pos = index->i[i] * 4 + swizzle;
            /* const buffer bounds check */
            if (pos < 0 || pos >= (int) mach->ConstsSize[constbuf]) {
               if (0) {
                  /* Debug: print warning */
                  static int count = 0;
                  if (count++ < 100)
                     debug_printf("TGSI Exec: const buffer index %d"
                                  " out of bounds\n", pos);
               }
               /* out-of-bounds reads return zero */
               chan->u[i] = 0;
            }
            else
               chan->u[i] = buf[pos];
         }
      }
      break;

   case TGSI_FILE_INPUT:
      for (i = 0; i < TGSI_QUAD_SIZE; i++) {
         /*
         if (PIPE_SHADER_GEOMETRY == mach->ShaderType) {
            debug_printf("Fetching Input[%d] (2d=%d, 1d=%d)\n",
                         index2D->i[i] * TGSI_EXEC_MAX_INPUT_ATTRIBS + index->i[i],
                         index2D->i[i], index->i[i]);
                         }*/
         /* flatten the 2D (index2D, index) address into Inputs[] */
         int pos = index2D->i[i] * TGSI_EXEC_MAX_INPUT_ATTRIBS + index->i[i];
         assert(pos >= 0);
         assert(pos < TGSI_MAX_PRIM_VERTICES * PIPE_MAX_ATTRIBS);
         chan->u[i] = mach->Inputs[pos].xyzw[swizzle].u[i];
      }
      break;

   case TGSI_FILE_SYSTEM_VALUE:
      /* XXX no swizzling at this point.  Will be needed if we put
       * gl_FragCoord, for example, in a sys value register.
       */
      /* note: the index is not range-checked here */
      for (i = 0; i < TGSI_QUAD_SIZE; i++) {
         chan->u[i] = mach->SystemValue[index->i[i]].xyzw[swizzle].u[i];
      }
      break;

   case TGSI_FILE_TEMPORARY:
      for (i = 0; i < TGSI_QUAD_SIZE; i++) {
         /* only the upper bound is asserted */
         assert(index->i[i] < TGSI_EXEC_NUM_TEMPS);
         assert(index2D->i[i] == 0);

         chan->u[i] = mach->Temps[index->i[i]].xyzw[swizzle].u[i];
      }
      break;

   case TGSI_FILE_IMMEDIATE:
      for (i = 0; i < TGSI_QUAD_SIZE; i++) {
         assert(index->i[i] >= 0 && index->i[i] < (int)mach->ImmLimit);
         assert(index2D->i[i] == 0);

         /* immediates are stored one value per component, so the same
          * value goes to every lane; copied through the float view
          */
         chan->f[i] = mach->Imms[index->i[i]][swizzle];
      }
      break;

   case TGSI_FILE_ADDRESS:
      for (i = 0; i < TGSI_QUAD_SIZE; i++) {
         assert(index->i[i] >= 0);
         assert(index2D->i[i] == 0);

         chan->u[i] = mach->Addrs[index->i[i]].xyzw[swizzle].u[i];
      }
      break;

   case TGSI_FILE_PREDICATE:
      for (i = 0; i < TGSI_QUAD_SIZE; i++) {
         assert(index->i[i] >= 0 && index->i[i] < TGSI_EXEC_NUM_PREDS);
         assert(index2D->i[i] == 0);

         /* the index is only asserted; predicate register 0 is always read */
         chan->u[i] = mach->Predicates[0].xyzw[swizzle].u[i];
      }
      break;

   case TGSI_FILE_OUTPUT:
      /* vertex/fragment output vars can be read too */
      for (i = 0; i < TGSI_QUAD_SIZE; i++) {
         assert(index->i[i] >= 0);
         assert(index2D->i[i] == 0);

         chan->u[i] = mach->Outputs[index->i[i]].xyzw[swizzle].u[i];
      }
      break;

   default:
      /* unknown file: assert in debug builds, otherwise return zeros */
      assert(0);
      for (i = 0; i < TGSI_QUAD_SIZE; i++) {
         chan->u[i] = 0;
      }
   }
}
1600
1601static void
1602fetch_source_d(const struct tgsi_exec_machine *mach,
1603               union tgsi_exec_channel *chan,
1604               const struct tgsi_full_src_register *reg,
1605               const uint chan_index,
1606               enum tgsi_exec_datatype src_datatype)
1607{
1608   union tgsi_exec_channel index;
1609   union tgsi_exec_channel index2D;
1610   uint swizzle;
1611
1612   /* We start with a direct index into a register file.
1613    *
1614    *    file[1],
1615    *    where:
1616    *       file = Register.File
1617    *       [1] = Register.Index
1618    */
1619   index.i[0] =
1620   index.i[1] =
1621   index.i[2] =
1622   index.i[3] = reg->Register.Index;
1623
1624   /* There is an extra source register that indirectly subscripts
1625    * a register file. The direct index now becomes an offset
1626    * that is being added to the indirect register.
1627    *
1628    *    file[ind[2].x+1],
1629    *    where:
1630    *       ind = Indirect.File
1631    *       [2] = Indirect.Index
1632    *       .x = Indirect.SwizzleX
1633    */
1634   if (reg->Register.Indirect) {
1635      union tgsi_exec_channel index2;
1636      union tgsi_exec_channel indir_index;
1637      const uint execmask = mach->ExecMask;
1638      uint i;
1639
1640      /* which address register (always zero now) */
1641      index2.i[0] =
1642      index2.i[1] =
1643      index2.i[2] =
1644      index2.i[3] = reg->Indirect.Index;
1645      /* get current value of address register[swizzle] */
1646      swizzle = reg->Indirect.Swizzle;
1647      fetch_src_file_channel(mach,
1648                             chan_index,
1649                             reg->Indirect.File,
1650                             swizzle,
1651                             &index2,
1652                             &ZeroVec,
1653                             &indir_index);
1654
1655      /* add value of address register to the offset */
1656      index.i[0] += indir_index.i[0];
1657      index.i[1] += indir_index.i[1];
1658      index.i[2] += indir_index.i[2];
1659      index.i[3] += indir_index.i[3];
1660
1661      /* for disabled execution channels, zero-out the index to
1662       * avoid using a potential garbage value.
1663       */
1664      for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1665         if ((execmask & (1 << i)) == 0)
1666            index.i[i] = 0;
1667      }
1668   }
1669
1670   /* There is an extra source register that is a second
1671    * subscript to a register file. Effectively it means that
1672    * the register file is actually a 2D array of registers.
1673    *
1674    *    file[3][1],
1675    *    where:
1676    *       [3] = Dimension.Index
1677    */
1678   if (reg->Register.Dimension) {
1679      index2D.i[0] =
1680      index2D.i[1] =
1681      index2D.i[2] =
1682      index2D.i[3] = reg->Dimension.Index;
1683
1684      /* Again, the second subscript index can be addressed indirectly
1685       * identically to the first one.
1686       * Nothing stops us from indirectly addressing the indirect register,
1687       * but there is no need for that, so we won't exercise it.
1688       *
1689       *    file[ind[4].y+3][1],
1690       *    where:
1691       *       ind = DimIndirect.File
1692       *       [4] = DimIndirect.Index
1693       *       .y = DimIndirect.SwizzleX
1694       */
1695      if (reg->Dimension.Indirect) {
1696         union tgsi_exec_channel index2;
1697         union tgsi_exec_channel indir_index;
1698         const uint execmask = mach->ExecMask;
1699         uint i;
1700
1701         index2.i[0] =
1702         index2.i[1] =
1703         index2.i[2] =
1704         index2.i[3] = reg->DimIndirect.Index;
1705
1706         swizzle = reg->DimIndirect.Swizzle;
1707         fetch_src_file_channel(mach,
1708                                chan_index,
1709                                reg->DimIndirect.File,
1710                                swizzle,
1711                                &index2,
1712                                &ZeroVec,
1713                                &indir_index);
1714
1715         index2D.i[0] += indir_index.i[0];
1716         index2D.i[1] += indir_index.i[1];
1717         index2D.i[2] += indir_index.i[2];
1718         index2D.i[3] += indir_index.i[3];
1719
1720         /* for disabled execution channels, zero-out the index to
1721          * avoid using a potential garbage value.
1722          */
1723         for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1724            if ((execmask & (1 << i)) == 0) {
1725               index2D.i[i] = 0;
1726            }
1727         }
1728      }
1729
1730      /* If by any chance there was a need for a 3D array of register
1731       * files, we would have to check whether Dimension is followed
1732       * by a dimension register and continue the saga.
1733       */
1734   } else {
1735      index2D.i[0] =
1736      index2D.i[1] =
1737      index2D.i[2] =
1738      index2D.i[3] = 0;
1739   }
1740
1741   swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index );
1742   fetch_src_file_channel(mach,
1743                          chan_index,
1744                          reg->Register.File,
1745                          swizzle,
1746                          &index,
1747                          &index2D,
1748                          chan);
1749}
1750
1751static void
1752fetch_source(const struct tgsi_exec_machine *mach,
1753             union tgsi_exec_channel *chan,
1754             const struct tgsi_full_src_register *reg,
1755             const uint chan_index,
1756             enum tgsi_exec_datatype src_datatype)
1757{
1758   fetch_source_d(mach, chan, reg, chan_index, src_datatype);
1759
1760   if (reg->Register.Absolute) {
1761      if (src_datatype == TGSI_EXEC_DATA_FLOAT) {
1762         micro_abs(chan, chan);
1763      } else {
1764         micro_iabs(chan, chan);
1765      }
1766   }
1767
1768   if (reg->Register.Negate) {
1769      if (src_datatype == TGSI_EXEC_DATA_FLOAT) {
1770         micro_neg(chan, chan);
1771      } else {
1772         micro_ineg(chan, chan);
1773      }
1774   }
1775}
1776
1777static union tgsi_exec_channel *
1778store_dest_dstret(struct tgsi_exec_machine *mach,
1779                 const union tgsi_exec_channel *chan,
1780                 const struct tgsi_full_dst_register *reg,
1781                 const struct tgsi_full_instruction *inst,
1782                 uint chan_index,
1783                 enum tgsi_exec_datatype dst_datatype)
1784{
1785   uint i;
1786   static union tgsi_exec_channel null;
1787   union tgsi_exec_channel *dst;
1788   union tgsi_exec_channel index2D;
1789   uint execmask = mach->ExecMask;
1790   int offset = 0;  /* indirection offset */
1791   int index;
1792
1793   /* for debugging */
1794   if (0 && dst_datatype == TGSI_EXEC_DATA_FLOAT) {
1795      check_inf_or_nan(chan);
1796   }
1797
1798   /* There is an extra source register that indirectly subscripts
1799    * a register file. The direct index now becomes an offset
1800    * that is being added to the indirect register.
1801    *
1802    *    file[ind[2].x+1],
1803    *    where:
1804    *       ind = Indirect.File
1805    *       [2] = Indirect.Index
1806    *       .x = Indirect.SwizzleX
1807    */
1808   if (reg->Register.Indirect) {
1809      union tgsi_exec_channel index;
1810      union tgsi_exec_channel indir_index;
1811      uint swizzle;
1812
1813      /* which address register (always zero for now) */
1814      index.i[0] =
1815      index.i[1] =
1816      index.i[2] =
1817      index.i[3] = reg->Indirect.Index;
1818
1819      /* get current value of address register[swizzle] */
1820      swizzle = reg->Indirect.Swizzle;
1821
1822      /* fetch values from the address/indirection register */
1823      fetch_src_file_channel(mach,
1824                             chan_index,
1825                             reg->Indirect.File,
1826                             swizzle,
1827                             &index,
1828                             &ZeroVec,
1829                             &indir_index);
1830
1831      /* save indirection offset */
1832      offset = indir_index.i[0];
1833   }
1834
1835   /* There is an extra source register that is a second
1836    * subscript to a register file. Effectively it means that
1837    * the register file is actually a 2D array of registers.
1838    *
1839    *    file[3][1],
1840    *    where:
1841    *       [3] = Dimension.Index
1842    */
1843   if (reg->Register.Dimension) {
1844      index2D.i[0] =
1845      index2D.i[1] =
1846      index2D.i[2] =
1847      index2D.i[3] = reg->Dimension.Index;
1848
1849      /* Again, the second subscript index can be addressed indirectly
1850       * identically to the first one.
1851       * Nothing stops us from indirectly addressing the indirect register,
1852       * but there is no need for that, so we won't exercise it.
1853       *
1854       *    file[ind[4].y+3][1],
1855       *    where:
1856       *       ind = DimIndirect.File
1857       *       [4] = DimIndirect.Index
1858       *       .y = DimIndirect.SwizzleX
1859       */
1860      if (reg->Dimension.Indirect) {
1861         union tgsi_exec_channel index2;
1862         union tgsi_exec_channel indir_index;
1863         const uint execmask = mach->ExecMask;
1864         unsigned swizzle;
1865         uint i;
1866
1867         index2.i[0] =
1868         index2.i[1] =
1869         index2.i[2] =
1870         index2.i[3] = reg->DimIndirect.Index;
1871
1872         swizzle = reg->DimIndirect.Swizzle;
1873         fetch_src_file_channel(mach,
1874                                chan_index,
1875                                reg->DimIndirect.File,
1876                                swizzle,
1877                                &index2,
1878                                &ZeroVec,
1879                                &indir_index);
1880
1881         index2D.i[0] += indir_index.i[0];
1882         index2D.i[1] += indir_index.i[1];
1883         index2D.i[2] += indir_index.i[2];
1884         index2D.i[3] += indir_index.i[3];
1885
1886         /* for disabled execution channels, zero-out the index to
1887          * avoid using a potential garbage value.
1888          */
1889         for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1890            if ((execmask & (1 << i)) == 0) {
1891               index2D.i[i] = 0;
1892            }
1893         }
1894      }
1895
1896      /* If by any chance there was a need for a 3D array of register
1897       * files, we would have to check whether Dimension is followed
1898       * by a dimension register and continue the saga.
1899       */
1900   } else {
1901      index2D.i[0] =
1902      index2D.i[1] =
1903      index2D.i[2] =
1904      index2D.i[3] = 0;
1905   }
1906
1907   switch (reg->Register.File) {
1908   case TGSI_FILE_NULL:
1909      dst = &null;
1910      break;
1911
1912   case TGSI_FILE_OUTPUT:
1913      index = mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0]
1914         + reg->Register.Index;
1915      dst = &mach->Outputs[offset + index].xyzw[chan_index];
1916#if 0
1917      debug_printf("NumOutputs = %d, TEMP_O_C/I = %d, redindex = %d\n",
1918                   mach->NumOutputs, mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0],
1919                   reg->Register.Index);
1920      if (PIPE_SHADER_GEOMETRY == mach->ShaderType) {
1921         debug_printf("STORING OUT[%d] mask(%d), = (", offset + index, execmask);
1922         for (i = 0; i < TGSI_QUAD_SIZE; i++)
1923            if (execmask & (1 << i))
1924               debug_printf("%f, ", chan->f[i]);
1925         debug_printf(")\n");
1926      }
1927#endif
1928      break;
1929
1930   case TGSI_FILE_TEMPORARY:
1931      index = reg->Register.Index;
1932      assert( index < TGSI_EXEC_NUM_TEMPS );
1933      dst = &mach->Temps[offset + index].xyzw[chan_index];
1934      break;
1935
1936   case TGSI_FILE_ADDRESS:
1937      index = reg->Register.Index;
1938      dst = &mach->Addrs[index].xyzw[chan_index];
1939      break;
1940
1941   case TGSI_FILE_PREDICATE:
1942      index = reg->Register.Index;
1943      assert(index < TGSI_EXEC_NUM_PREDS);
1944      dst = &mach->Predicates[index].xyzw[chan_index];
1945      break;
1946
1947   default:
1948      assert( 0 );
1949      return NULL;
1950   }
1951
1952   if (inst->Instruction.Predicate) {
1953      uint swizzle;
1954      union tgsi_exec_channel *pred;
1955
1956      switch (chan_index) {
1957      case TGSI_CHAN_X:
1958         swizzle = inst->Predicate.SwizzleX;
1959         break;
1960      case TGSI_CHAN_Y:
1961         swizzle = inst->Predicate.SwizzleY;
1962         break;
1963      case TGSI_CHAN_Z:
1964         swizzle = inst->Predicate.SwizzleZ;
1965         break;
1966      case TGSI_CHAN_W:
1967         swizzle = inst->Predicate.SwizzleW;
1968         break;
1969      default:
1970         assert(0);
1971         return NULL;
1972      }
1973
1974      assert(inst->Predicate.Index == 0);
1975
1976      pred = &mach->Predicates[inst->Predicate.Index].xyzw[swizzle];
1977
1978      if (inst->Predicate.Negate) {
1979         for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1980            if (pred->u[i]) {
1981               execmask &= ~(1 << i);
1982            }
1983         }
1984      } else {
1985         for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1986            if (!pred->u[i]) {
1987               execmask &= ~(1 << i);
1988            }
1989         }
1990      }
1991   }
1992
1993   return dst;
1994}
1995
1996static void
1997store_dest_double(struct tgsi_exec_machine *mach,
1998                 const union tgsi_exec_channel *chan,
1999                 const struct tgsi_full_dst_register *reg,
2000                 const struct tgsi_full_instruction *inst,
2001                 uint chan_index,
2002                 enum tgsi_exec_datatype dst_datatype)
2003{
2004   union tgsi_exec_channel *dst;
2005   const uint execmask = mach->ExecMask;
2006   int i;
2007
2008   dst = store_dest_dstret(mach, chan, reg, inst, chan_index,
2009			   dst_datatype);
2010   if (!dst)
2011      return;
2012
2013   /* doubles path */
2014   for (i = 0; i < TGSI_QUAD_SIZE; i++)
2015      if (execmask & (1 << i))
2016         dst->i[i] = chan->i[i];
2017}
2018
2019static void
2020store_dest(struct tgsi_exec_machine *mach,
2021           const union tgsi_exec_channel *chan,
2022           const struct tgsi_full_dst_register *reg,
2023           const struct tgsi_full_instruction *inst,
2024           uint chan_index,
2025           enum tgsi_exec_datatype dst_datatype)
2026{
2027   union tgsi_exec_channel *dst;
2028   const uint execmask = mach->ExecMask;
2029   int i;
2030
2031   dst = store_dest_dstret(mach, chan, reg, inst, chan_index,
2032                    dst_datatype);
2033   if (!dst)
2034      return;
2035
2036   if (!inst->Instruction.Saturate) {
2037      for (i = 0; i < TGSI_QUAD_SIZE; i++)
2038         if (execmask & (1 << i))
2039            dst->i[i] = chan->i[i];
2040   }
2041   else {
2042      for (i = 0; i < TGSI_QUAD_SIZE; i++)
2043         if (execmask & (1 << i)) {
2044            if (chan->f[i] < 0.0f)
2045               dst->f[i] = 0.0f;
2046            else if (chan->f[i] > 1.0f)
2047               dst->f[i] = 1.0f;
2048            else
2049               dst->i[i] = chan->i[i];
2050         }
2051   }
2052}
2053
/**
 * Fetch channel CHAN of source operand INDEX of `inst` into VAL via
 * fetch_source(), requesting the float datatype.
 * Relies on variables named `mach` and `inst` being in scope at the
 * point of use.
 */
#define FETCH(VAL,INDEX,CHAN)\
    fetch_source(mach, VAL, &inst->Src[INDEX], CHAN, TGSI_EXEC_DATA_FLOAT)

/**
 * Same as FETCH() but requests the integer datatype.
 */
#define IFETCH(VAL,INDEX,CHAN)\
    fetch_source(mach, VAL, &inst->Src[INDEX], CHAN, TGSI_EXEC_DATA_INT)
2059
2060
2061/**
2062 * Execute ARB-style KIL which is predicated by a src register.
2063 * Kill fragment if any of the four values is less than zero.
2064 */
2065static void
2066exec_kill_if(struct tgsi_exec_machine *mach,
2067             const struct tgsi_full_instruction *inst)
2068{
2069   uint uniquemask;
2070   uint chan_index;
2071   uint kilmask = 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
2072   union tgsi_exec_channel r[1];
2073
2074   /* This mask stores component bits that were already tested. */
2075   uniquemask = 0;
2076
2077   for (chan_index = 0; chan_index < 4; chan_index++)
2078   {
2079      uint swizzle;
2080      uint i;
2081
2082      /* unswizzle channel */
2083      swizzle = tgsi_util_get_full_src_register_swizzle (
2084                        &inst->Src[0],
2085                        chan_index);
2086
2087      /* check if the component has not been already tested */
2088      if (uniquemask & (1 << swizzle))
2089         continue;
2090      uniquemask |= 1 << swizzle;
2091
2092      FETCH(&r[0], 0, chan_index);
2093      for (i = 0; i < 4; i++)
2094         if (r[0].f[i] < 0.0f)
2095            kilmask |= 1 << i;
2096   }
2097
2098   /* restrict to fragments currently executing */
2099   kilmask &= mach->ExecMask;
2100
2101   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
2102}
2103
2104/**
2105 * Unconditional fragment kill/discard.
2106 */
2107static void
2108exec_kill(struct tgsi_exec_machine *mach,
2109          const struct tgsi_full_instruction *inst)
2110{
2111   uint kilmask; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
2112
2113   /* kill fragment for all fragments currently executing */
2114   kilmask = mach->ExecMask;
2115   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
2116}
2117
2118static void
2119emit_vertex(struct tgsi_exec_machine *mach)
2120{
2121   /* FIXME: check for exec mask correctly
2122   unsigned i;
2123   for (i = 0; i < TGSI_QUAD_SIZE; ++i) {
2124         if ((mach->ExecMask & (1 << i)))
2125   */
2126   if (mach->ExecMask) {
2127      if (mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]] >= mach->MaxOutputVertices)
2128         return;
2129
2130      mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] += mach->NumOutputs;
2131      mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]]++;
2132   }
2133}
2134
2135static void
2136emit_primitive(struct tgsi_exec_machine *mach)
2137{
2138   unsigned *prim_count = &mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0];
2139   /* FIXME: check for exec mask correctly
2140   unsigned i;
2141   for (i = 0; i < TGSI_QUAD_SIZE; ++i) {
2142         if ((mach->ExecMask & (1 << i)))
2143   */
2144   if (mach->ExecMask) {
2145      ++(*prim_count);
2146      debug_assert((*prim_count * mach->NumOutputs) < mach->MaxGeometryShaderOutputs);
2147      mach->Primitives[*prim_count] = 0;
2148   }
2149}
2150
2151static void
2152conditional_emit_primitive(struct tgsi_exec_machine *mach)
2153{
2154   if (PIPE_SHADER_GEOMETRY == mach->ShaderType) {
2155      int emitted_verts =
2156         mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]];
2157      if (emitted_verts) {
2158         emit_primitive(mach);
2159      }
2160   }
2161}
2162
2163
2164/*
2165 * Fetch four texture samples using STR texture coordinates.
2166 */
2167static void
2168fetch_texel( struct tgsi_sampler *sampler,
2169             const unsigned sview_idx,
2170             const unsigned sampler_idx,
2171             const union tgsi_exec_channel *s,
2172             const union tgsi_exec_channel *t,
2173             const union tgsi_exec_channel *p,
2174             const union tgsi_exec_channel *c0,
2175             const union tgsi_exec_channel *c1,
2176             float derivs[3][2][TGSI_QUAD_SIZE],
2177             const int8_t offset[3],
2178             enum tgsi_sampler_control control,
2179             union tgsi_exec_channel *r,
2180             union tgsi_exec_channel *g,
2181             union tgsi_exec_channel *b,
2182             union tgsi_exec_channel *a )
2183{
2184   uint j;
2185   float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
2186
2187   /* FIXME: handle explicit derivs, offsets */
2188   sampler->get_samples(sampler, sview_idx, sampler_idx,
2189                        s->f, t->f, p->f, c0->f, c1->f, derivs, offset, control, rgba);
2190
2191   for (j = 0; j < 4; j++) {
2192      r->f[j] = rgba[0][j];
2193      g->f[j] = rgba[1][j];
2194      b->f[j] = rgba[2][j];
2195      a->f[j] = rgba[3][j];
2196   }
2197}
2198
2199
/*
 * Values for the `modifier` argument of exec_tex() and exec_sample().
 * They select how the extra argument of a texture instruction is used
 * (projection divisor, lod bias, explicit lod, ...) or that there is none.
 */
#define TEX_MODIFIER_NONE           0
#define TEX_MODIFIER_PROJECTED      1
#define TEX_MODIFIER_LOD_BIAS       2
#define TEX_MODIFIER_EXPLICIT_LOD   3
#define TEX_MODIFIER_LEVEL_ZERO     4
#define TEX_MODIFIER_GATHER         5
2206
2207/*
2208 * Fetch all 3 (for s,t,r coords) texel offsets, put them into int array.
2209 */
2210static void
2211fetch_texel_offsets(struct tgsi_exec_machine *mach,
2212                    const struct tgsi_full_instruction *inst,
2213                    int8_t offsets[3])
2214{
2215   if (inst->Texture.NumOffsets == 1) {
2216      union tgsi_exec_channel index;
2217      union tgsi_exec_channel offset[3];
2218      index.i[0] = index.i[1] = index.i[2] = index.i[3] = inst->TexOffsets[0].Index;
2219      fetch_src_file_channel(mach, 0, inst->TexOffsets[0].File,
2220                             inst->TexOffsets[0].SwizzleX, &index, &ZeroVec, &offset[0]);
2221      fetch_src_file_channel(mach, 0, inst->TexOffsets[0].File,
2222                             inst->TexOffsets[0].SwizzleY, &index, &ZeroVec, &offset[1]);
2223      fetch_src_file_channel(mach, 0, inst->TexOffsets[0].File,
2224                             inst->TexOffsets[0].SwizzleZ, &index, &ZeroVec, &offset[2]);
2225     offsets[0] = offset[0].i[0];
2226     offsets[1] = offset[1].i[0];
2227     offsets[2] = offset[2].i[0];
2228   } else {
2229     assert(inst->Texture.NumOffsets == 0);
2230     offsets[0] = offsets[1] = offsets[2] = 0;
2231   }
2232}
2233
2234
2235/*
2236 * Fetch dx and dy values for one channel (s, t or r).
2237 * Put dx values into one float array, dy values into another.
2238 */
2239static void
2240fetch_assign_deriv_channel(struct tgsi_exec_machine *mach,
2241                           const struct tgsi_full_instruction *inst,
2242                           unsigned regdsrcx,
2243                           unsigned chan,
2244                           float derivs[2][TGSI_QUAD_SIZE])
2245{
2246   union tgsi_exec_channel d;
2247   FETCH(&d, regdsrcx, chan);
2248   derivs[0][0] = d.f[0];
2249   derivs[0][1] = d.f[1];
2250   derivs[0][2] = d.f[2];
2251   derivs[0][3] = d.f[3];
2252   FETCH(&d, regdsrcx + 1, chan);
2253   derivs[1][0] = d.f[0];
2254   derivs[1][1] = d.f[1];
2255   derivs[1][2] = d.f[2];
2256   derivs[1][3] = d.f[3];
2257}
2258
2259static uint
2260fetch_sampler_unit(struct tgsi_exec_machine *mach,
2261                   const struct tgsi_full_instruction *inst,
2262                   uint sampler)
2263{
2264   uint unit = 0;
2265   int i;
2266   if (inst->Src[sampler].Register.Indirect) {
2267      const struct tgsi_full_src_register *reg = &inst->Src[sampler];
2268      union tgsi_exec_channel indir_index, index2;
2269      const uint execmask = mach->ExecMask;
2270      index2.i[0] =
2271      index2.i[1] =
2272      index2.i[2] =
2273      index2.i[3] = reg->Indirect.Index;
2274
2275      fetch_src_file_channel(mach,
2276                             0,
2277                             reg->Indirect.File,
2278                             reg->Indirect.Swizzle,
2279                             &index2,
2280                             &ZeroVec,
2281                             &indir_index);
2282      for (i = 0; i < TGSI_QUAD_SIZE; i++) {
2283         if (execmask & (1 << i)) {
2284            unit = inst->Src[sampler].Register.Index + indir_index.i[i];
2285            break;
2286         }
2287      }
2288
2289   } else {
2290      unit = inst->Src[sampler].Register.Index;
2291   }
2292   return unit;
2293}
2294
/*
 * execute a texture instruction.
 *
 * modifier is used to control the channel routing for the
 * instruction variants like proj, lod, and texture with lod bias.
 * sampler indicates which src register the sampler is contained in.
 *
 * The five entries of args[] are passed to fetch_texel() as its
 * s, t, p, c0 and c1 inputs.  r[] holds the fetched arguments and is also
 * where fetch_texel() writes the resulting R, G, B, A channels (r[0..3]),
 * which are then stored to Dst[0] according to its write mask.
 */
static void
exec_tex(struct tgsi_exec_machine *mach,
         const struct tgsi_full_instruction *inst,
         uint modifier, uint sampler)
{
   const union tgsi_exec_channel *args[5], *proj = NULL;
   union tgsi_exec_channel r[5];
   enum tgsi_sampler_control control = TGSI_SAMPLER_LOD_NONE;
   uint chan;
   uint unit;
   int8_t offsets[3];
   int dim, shadow_ref, i;

   unit = fetch_sampler_unit(mach, inst, sampler);
   /* always fetch all 3 offsets, overkill but keeps code simple */
   fetch_texel_offsets(mach, inst, offsets);

   assert(modifier != TEX_MODIFIER_LEVEL_ZERO);
   assert(inst->Texture.Texture != TGSI_TEXTURE_BUFFER);

   /* number of coordinate channels, and the flat argument slot that holds
    * the shadow reference value (negative if the target has none)
    */
   dim = tgsi_util_get_texture_coord_dim(inst->Texture.Texture);
   shadow_ref = tgsi_util_get_shadow_ref_src_index(inst->Texture.Texture);

   assert(dim <= 4);
   if (shadow_ref >= 0)
      assert(shadow_ref >= dim && shadow_ref < ARRAY_SIZE(args));

   /* fetch modifier to the last argument */
   if (modifier != TEX_MODIFIER_NONE) {
      const int last = ARRAY_SIZE(args) - 1;

      /* fetch modifier from src0.w or src1.x */
      if (sampler == 1) {
         /* sampler in src1: the modifier shares src0 with the coordinates */
         assert(dim <= TGSI_CHAN_W && shadow_ref != TGSI_CHAN_W);
         FETCH(&r[last], 0, TGSI_CHAN_W);
      }
      else {
         assert(shadow_ref != 4);
         FETCH(&r[last], 1, TGSI_CHAN_X);
      }

      if (modifier != TEX_MODIFIER_PROJECTED) {
         args[last] = &r[last];
      }
      else {
         /* the projection divisor is applied to the coordinates below and
          * is not passed on to the sampler
          */
         proj = &r[last];
         args[last] = &ZeroVec;
      }

      /* point unused arguments to zero vector */
      for (i = dim; i < last; i++)
         args[i] = &ZeroVec;

      if (modifier == TEX_MODIFIER_EXPLICIT_LOD)
         control = TGSI_SAMPLER_LOD_EXPLICIT;
      else if (modifier == TEX_MODIFIER_LOD_BIAS)
         control = TGSI_SAMPLER_LOD_BIAS;
      else if (modifier == TEX_MODIFIER_GATHER)
         control = TGSI_SAMPLER_GATHER;
   }
   else {
      /* no modifier: every argument beyond the coordinates is zero */
      for (i = dim; i < ARRAY_SIZE(args); i++)
         args[i] = &ZeroVec;
   }

   /* fetch coordinates */
   for (i = 0; i < dim; i++) {
      FETCH(&r[i], 0, TGSI_CHAN_X + i);

      if (proj)
         micro_div(&r[i], &r[i], proj);

      args[i] = &r[i];
   }

   /* fetch reference value */
   if (shadow_ref >= 0) {
      /* shadow_ref is a flat index: operand = ref / 4, channel = ref % 4 */
      FETCH(&r[shadow_ref], shadow_ref / 4, TGSI_CHAN_X + (shadow_ref % 4));

      if (proj)
         micro_div(&r[shadow_ref], &r[shadow_ref], proj);

      args[shadow_ref] = &r[shadow_ref];
   }

   /* the same unit is used as sampler view and sampler index; no explicit
    * derivatives are supplied
    */
   fetch_texel(mach->Sampler, unit, unit,
         args[0], args[1], args[2], args[3], args[4],
         NULL, offsets, control,
         &r[0], &r[1], &r[2], &r[3]);     /* R, G, B, A */

#if 0
   debug_printf("fetch r: %g %g %g %g\n",
         r[0].f[0], r[0].f[1], r[0].f[2], r[0].f[3]);
   debug_printf("fetch g: %g %g %g %g\n",
         r[1].f[0], r[1].f[1], r[1].f[2], r[1].f[3]);
   debug_printf("fetch b: %g %g %g %g\n",
         r[2].f[0], r[2].f[1], r[2].f[2], r[2].f[3]);
   debug_printf("fetch a: %g %g %g %g\n",
         r[3].f[0], r[3].f[1], r[3].f[2], r[3].f[3]);
#endif

   /* write back only the channels enabled in the destination write mask */
   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
      }
   }
}
2409
2410static void
2411exec_lodq(struct tgsi_exec_machine *mach,
2412          const struct tgsi_full_instruction *inst)
2413{
2414   uint unit;
2415   int dim;
2416   int i;
2417   union tgsi_exec_channel coords[4];
2418   const union tgsi_exec_channel *args[ARRAY_SIZE(coords)];
2419   union tgsi_exec_channel r[2];
2420
2421   unit = fetch_sampler_unit(mach, inst, 1);
2422   dim = tgsi_util_get_texture_coord_dim(inst->Texture.Texture);
2423   assert(dim <= ARRAY_SIZE(coords));
2424   /* fetch coordinates */
2425   for (i = 0; i < dim; i++) {
2426      FETCH(&coords[i], 0, TGSI_CHAN_X + i);
2427      args[i] = &coords[i];
2428   }
2429   for (i = dim; i < ARRAY_SIZE(coords); i++) {
2430      args[i] = &ZeroVec;
2431   }
2432   mach->Sampler->query_lod(mach->Sampler, unit, unit,
2433                            args[0]->f,
2434                            args[1]->f,
2435                            args[2]->f,
2436                            args[3]->f,
2437                            TGSI_SAMPLER_LOD_NONE,
2438                            r[0].f,
2439                            r[1].f);
2440
2441   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2442      store_dest(mach, &r[0], &inst->Dst[0], inst, TGSI_CHAN_X,
2443                 TGSI_EXEC_DATA_FLOAT);
2444   }
2445   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2446      store_dest(mach, &r[1], &inst->Dst[0], inst, TGSI_CHAN_Y,
2447                 TGSI_EXEC_DATA_FLOAT);
2448   }
2449}
2450
2451static void
2452exec_txd(struct tgsi_exec_machine *mach,
2453         const struct tgsi_full_instruction *inst)
2454{
2455   union tgsi_exec_channel r[4];
2456   float derivs[3][2][TGSI_QUAD_SIZE];
2457   uint chan;
2458   uint unit;
2459   int8_t offsets[3];
2460
2461   unit = fetch_sampler_unit(mach, inst, 3);
2462   /* always fetch all 3 offsets, overkill but keeps code simple */
2463   fetch_texel_offsets(mach, inst, offsets);
2464
2465   switch (inst->Texture.Texture) {
2466   case TGSI_TEXTURE_1D:
2467      FETCH(&r[0], 0, TGSI_CHAN_X);
2468
2469      fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_X, derivs[0]);
2470
2471      fetch_texel(mach->Sampler, unit, unit,
2472                  &r[0], &ZeroVec, &ZeroVec, &ZeroVec, &ZeroVec,   /* S, T, P, C, LOD */
2473                  derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
2474                  &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
2475      break;
2476
2477   case TGSI_TEXTURE_SHADOW1D:
2478   case TGSI_TEXTURE_1D_ARRAY:
2479   case TGSI_TEXTURE_SHADOW1D_ARRAY:
2480      /* SHADOW1D/1D_ARRAY would not need Y/Z respectively, but don't bother */
2481      FETCH(&r[0], 0, TGSI_CHAN_X);
2482      FETCH(&r[1], 0, TGSI_CHAN_Y);
2483      FETCH(&r[2], 0, TGSI_CHAN_Z);
2484
2485      fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_X, derivs[0]);
2486
2487      fetch_texel(mach->Sampler, unit, unit,
2488                  &r[0], &r[1], &r[2], &ZeroVec, &ZeroVec,   /* S, T, P, C, LOD */
2489                  derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
2490                  &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
2491      break;
2492
2493   case TGSI_TEXTURE_2D:
2494   case TGSI_TEXTURE_RECT:
2495      FETCH(&r[0], 0, TGSI_CHAN_X);
2496      FETCH(&r[1], 0, TGSI_CHAN_Y);
2497
2498      fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_X, derivs[0]);
2499      fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_Y, derivs[1]);
2500
2501      fetch_texel(mach->Sampler, unit, unit,
2502                  &r[0], &r[1], &ZeroVec, &ZeroVec, &ZeroVec,   /* S, T, P, C, LOD */
2503                  derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
2504                  &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
2505      break;
2506
2507
2508   case TGSI_TEXTURE_SHADOW2D:
2509   case TGSI_TEXTURE_SHADOWRECT:
2510   case TGSI_TEXTURE_2D_ARRAY:
2511   case TGSI_TEXTURE_SHADOW2D_ARRAY:
2512      /* only SHADOW2D_ARRAY actually needs W */
2513      FETCH(&r[0], 0, TGSI_CHAN_X);
2514      FETCH(&r[1], 0, TGSI_CHAN_Y);
2515      FETCH(&r[2], 0, TGSI_CHAN_Z);
2516      FETCH(&r[3], 0, TGSI_CHAN_W);
2517
2518      fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_X, derivs[0]);
2519      fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_Y, derivs[1]);
2520
2521      fetch_texel(mach->Sampler, unit, unit,
2522                  &r[0], &r[1], &r[2], &r[3], &ZeroVec,   /* inputs */
2523                  derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
2524                  &r[0], &r[1], &r[2], &r[3]);     /* outputs */
2525      break;
2526
2527   case TGSI_TEXTURE_3D:
2528   case TGSI_TEXTURE_CUBE:
2529   case TGSI_TEXTURE_CUBE_ARRAY:
2530   case TGSI_TEXTURE_SHADOWCUBE:
2531      /* only TEXTURE_CUBE_ARRAY and TEXTURE_SHADOWCUBE actually need W */
2532      FETCH(&r[0], 0, TGSI_CHAN_X);
2533      FETCH(&r[1], 0, TGSI_CHAN_Y);
2534      FETCH(&r[2], 0, TGSI_CHAN_Z);
2535      FETCH(&r[3], 0, TGSI_CHAN_W);
2536
2537      fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_X, derivs[0]);
2538      fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_Y, derivs[1]);
2539      fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_Z, derivs[2]);
2540
2541      fetch_texel(mach->Sampler, unit, unit,
2542                  &r[0], &r[1], &r[2], &r[3], &ZeroVec,   /* inputs */
2543                  derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
2544                  &r[0], &r[1], &r[2], &r[3]);     /* outputs */
2545      break;
2546
2547   default:
2548      assert(0);
2549   }
2550
2551   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2552      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2553         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2554      }
2555   }
2556}
2557
2558
2559static void
2560exec_txf(struct tgsi_exec_machine *mach,
2561         const struct tgsi_full_instruction *inst)
2562{
2563   union tgsi_exec_channel r[4];
2564   uint chan;
2565   uint unit;
2566   float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
2567   int j;
2568   int8_t offsets[3];
2569   unsigned target;
2570
2571   unit = fetch_sampler_unit(mach, inst, 1);
2572   /* always fetch all 3 offsets, overkill but keeps code simple */
2573   fetch_texel_offsets(mach, inst, offsets);
2574
2575   IFETCH(&r[3], 0, TGSI_CHAN_W);
2576
2577   if (inst->Instruction.Opcode == TGSI_OPCODE_SAMPLE_I ||
2578       inst->Instruction.Opcode == TGSI_OPCODE_SAMPLE_I_MS) {
2579      target = mach->SamplerViews[unit].Resource;
2580   }
2581   else {
2582      target = inst->Texture.Texture;
2583   }
2584   switch(target) {
2585   case TGSI_TEXTURE_3D:
2586   case TGSI_TEXTURE_2D_ARRAY:
2587   case TGSI_TEXTURE_SHADOW2D_ARRAY:
2588   case TGSI_TEXTURE_2D_ARRAY_MSAA:
2589      IFETCH(&r[2], 0, TGSI_CHAN_Z);
2590      /* fallthrough */
2591   case TGSI_TEXTURE_2D:
2592   case TGSI_TEXTURE_RECT:
2593   case TGSI_TEXTURE_SHADOW1D_ARRAY:
2594   case TGSI_TEXTURE_SHADOW2D:
2595   case TGSI_TEXTURE_SHADOWRECT:
2596   case TGSI_TEXTURE_1D_ARRAY:
2597   case TGSI_TEXTURE_2D_MSAA:
2598      IFETCH(&r[1], 0, TGSI_CHAN_Y);
2599      /* fallthrough */
2600   case TGSI_TEXTURE_BUFFER:
2601   case TGSI_TEXTURE_1D:
2602   case TGSI_TEXTURE_SHADOW1D:
2603      IFETCH(&r[0], 0, TGSI_CHAN_X);
2604      break;
2605   default:
2606      assert(0);
2607      break;
2608   }
2609
2610   mach->Sampler->get_texel(mach->Sampler, unit, r[0].i, r[1].i, r[2].i, r[3].i,
2611                            offsets, rgba);
2612
2613   for (j = 0; j < TGSI_QUAD_SIZE; j++) {
2614      r[0].f[j] = rgba[0][j];
2615      r[1].f[j] = rgba[1][j];
2616      r[2].f[j] = rgba[2][j];
2617      r[3].f[j] = rgba[3][j];
2618   }
2619
2620   if (inst->Instruction.Opcode == TGSI_OPCODE_SAMPLE_I ||
2621       inst->Instruction.Opcode == TGSI_OPCODE_SAMPLE_I_MS) {
2622      unsigned char swizzles[4];
2623      swizzles[0] = inst->Src[1].Register.SwizzleX;
2624      swizzles[1] = inst->Src[1].Register.SwizzleY;
2625      swizzles[2] = inst->Src[1].Register.SwizzleZ;
2626      swizzles[3] = inst->Src[1].Register.SwizzleW;
2627
2628      for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2629         if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2630            store_dest(mach, &r[swizzles[chan]],
2631                       &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2632         }
2633      }
2634   }
2635   else {
2636      for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2637         if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2638            store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2639         }
2640      }
2641   }
2642}
2643
2644static void
2645exec_txq(struct tgsi_exec_machine *mach,
2646         const struct tgsi_full_instruction *inst)
2647{
2648   int result[4];
2649   union tgsi_exec_channel r[4], src;
2650   uint chan;
2651   uint unit;
2652   int i,j;
2653
2654   unit = fetch_sampler_unit(mach, inst, 1);
2655
2656   fetch_source(mach, &src, &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_INT);
2657
2658   /* XXX: This interface can't return per-pixel values */
2659   mach->Sampler->get_dims(mach->Sampler, unit, src.i[0], result);
2660
2661   for (i = 0; i < TGSI_QUAD_SIZE; i++) {
2662      for (j = 0; j < 4; j++) {
2663         r[j].i[i] = result[j];
2664      }
2665   }
2666
2667   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2668      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2669         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan,
2670                    TGSI_EXEC_DATA_INT);
2671      }
2672   }
2673}
2674
2675static void
2676exec_sample(struct tgsi_exec_machine *mach,
2677            const struct tgsi_full_instruction *inst,
2678            uint modifier, boolean compare)
2679{
2680   const uint resource_unit = inst->Src[1].Register.Index;
2681   const uint sampler_unit = inst->Src[2].Register.Index;
2682   union tgsi_exec_channel r[5], c1;
2683   const union tgsi_exec_channel *lod = &ZeroVec;
2684   enum tgsi_sampler_control control = TGSI_SAMPLER_LOD_NONE;
2685   uint chan;
2686   unsigned char swizzles[4];
2687   int8_t offsets[3];
2688
2689   /* always fetch all 3 offsets, overkill but keeps code simple */
2690   fetch_texel_offsets(mach, inst, offsets);
2691
2692   assert(modifier != TEX_MODIFIER_PROJECTED);
2693
2694   if (modifier != TEX_MODIFIER_NONE) {
2695      if (modifier == TEX_MODIFIER_LOD_BIAS) {
2696         FETCH(&c1, 3, TGSI_CHAN_X);
2697         lod = &c1;
2698         control = TGSI_SAMPLER_LOD_BIAS;
2699      }
2700      else if (modifier == TEX_MODIFIER_EXPLICIT_LOD) {
2701         FETCH(&c1, 3, TGSI_CHAN_X);
2702         lod = &c1;
2703         control = TGSI_SAMPLER_LOD_EXPLICIT;
2704      }
2705      else {
2706         assert(modifier == TEX_MODIFIER_LEVEL_ZERO);
2707         control = TGSI_SAMPLER_LOD_ZERO;
2708      }
2709   }
2710
2711   FETCH(&r[0], 0, TGSI_CHAN_X);
2712
2713   switch (mach->SamplerViews[resource_unit].Resource) {
2714   case TGSI_TEXTURE_1D:
2715      if (compare) {
2716         FETCH(&r[2], 3, TGSI_CHAN_X);
2717         fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2718                     &r[0], &ZeroVec, &r[2], &ZeroVec, lod, /* S, T, P, C, LOD */
2719                     NULL, offsets, control,
2720                     &r[0], &r[1], &r[2], &r[3]);     /* R, G, B, A */
2721      }
2722      else {
2723         fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2724                     &r[0], &ZeroVec, &ZeroVec, &ZeroVec, lod, /* S, T, P, C, LOD */
2725                     NULL, offsets, control,
2726                     &r[0], &r[1], &r[2], &r[3]);     /* R, G, B, A */
2727      }
2728      break;
2729
2730   case TGSI_TEXTURE_1D_ARRAY:
2731   case TGSI_TEXTURE_2D:
2732   case TGSI_TEXTURE_RECT:
2733      FETCH(&r[1], 0, TGSI_CHAN_Y);
2734      if (compare) {
2735         FETCH(&r[2], 3, TGSI_CHAN_X);
2736         fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2737                     &r[0], &r[1], &r[2], &ZeroVec, lod,    /* S, T, P, C, LOD */
2738                     NULL, offsets, control,
2739                     &r[0], &r[1], &r[2], &r[3]);  /* outputs */
2740      }
2741      else {
2742         fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2743                     &r[0], &r[1], &ZeroVec, &ZeroVec, lod,    /* S, T, P, C, LOD */
2744                     NULL, offsets, control,
2745                     &r[0], &r[1], &r[2], &r[3]);  /* outputs */
2746      }
2747      break;
2748
2749   case TGSI_TEXTURE_2D_ARRAY:
2750   case TGSI_TEXTURE_3D:
2751   case TGSI_TEXTURE_CUBE:
2752      FETCH(&r[1], 0, TGSI_CHAN_Y);
2753      FETCH(&r[2], 0, TGSI_CHAN_Z);
2754      if(compare) {
2755         FETCH(&r[3], 3, TGSI_CHAN_X);
2756         fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2757                     &r[0], &r[1], &r[2], &r[3], lod,
2758                     NULL, offsets, control,
2759                     &r[0], &r[1], &r[2], &r[3]);
2760      }
2761      else {
2762         fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2763                     &r[0], &r[1], &r[2], &ZeroVec, lod,
2764                     NULL, offsets, control,
2765                     &r[0], &r[1], &r[2], &r[3]);
2766      }
2767      break;
2768
2769   case TGSI_TEXTURE_CUBE_ARRAY:
2770      FETCH(&r[1], 0, TGSI_CHAN_Y);
2771      FETCH(&r[2], 0, TGSI_CHAN_Z);
2772      FETCH(&r[3], 0, TGSI_CHAN_W);
2773      if(compare) {
2774         FETCH(&r[4], 3, TGSI_CHAN_X);
2775         fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2776                     &r[0], &r[1], &r[2], &r[3], &r[4],
2777                     NULL, offsets, control,
2778                     &r[0], &r[1], &r[2], &r[3]);
2779      }
2780      else {
2781         fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2782                     &r[0], &r[1], &r[2], &r[3], lod,
2783                     NULL, offsets, control,
2784                     &r[0], &r[1], &r[2], &r[3]);
2785      }
2786      break;
2787
2788
2789   default:
2790      assert(0);
2791   }
2792
2793   swizzles[0] = inst->Src[1].Register.SwizzleX;
2794   swizzles[1] = inst->Src[1].Register.SwizzleY;
2795   swizzles[2] = inst->Src[1].Register.SwizzleZ;
2796   swizzles[3] = inst->Src[1].Register.SwizzleW;
2797
2798   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2799      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2800         store_dest(mach, &r[swizzles[chan]],
2801                    &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2802      }
2803   }
2804}
2805
2806static void
2807exec_sample_d(struct tgsi_exec_machine *mach,
2808              const struct tgsi_full_instruction *inst)
2809{
2810   const uint resource_unit = inst->Src[1].Register.Index;
2811   const uint sampler_unit = inst->Src[2].Register.Index;
2812   union tgsi_exec_channel r[4];
2813   float derivs[3][2][TGSI_QUAD_SIZE];
2814   uint chan;
2815   unsigned char swizzles[4];
2816   int8_t offsets[3];
2817
2818   /* always fetch all 3 offsets, overkill but keeps code simple */
2819   fetch_texel_offsets(mach, inst, offsets);
2820
2821   FETCH(&r[0], 0, TGSI_CHAN_X);
2822
2823   switch (mach->SamplerViews[resource_unit].Resource) {
2824   case TGSI_TEXTURE_1D:
2825   case TGSI_TEXTURE_1D_ARRAY:
2826      /* only 1D array actually needs Y */
2827      FETCH(&r[1], 0, TGSI_CHAN_Y);
2828
2829      fetch_assign_deriv_channel(mach, inst, 3, TGSI_CHAN_X, derivs[0]);
2830
2831      fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2832                  &r[0], &r[1], &ZeroVec, &ZeroVec, &ZeroVec,   /* S, T, P, C, LOD */
2833                  derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
2834                  &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
2835      break;
2836
2837   case TGSI_TEXTURE_2D:
2838   case TGSI_TEXTURE_RECT:
2839   case TGSI_TEXTURE_2D_ARRAY:
2840      /* only 2D array actually needs Z */
2841      FETCH(&r[1], 0, TGSI_CHAN_Y);
2842      FETCH(&r[2], 0, TGSI_CHAN_Z);
2843
2844      fetch_assign_deriv_channel(mach, inst, 3, TGSI_CHAN_X, derivs[0]);
2845      fetch_assign_deriv_channel(mach, inst, 3, TGSI_CHAN_Y, derivs[1]);
2846
2847      fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2848                  &r[0], &r[1], &r[2], &ZeroVec, &ZeroVec,   /* inputs */
2849                  derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
2850                  &r[0], &r[1], &r[2], &r[3]);     /* outputs */
2851      break;
2852
2853   case TGSI_TEXTURE_3D:
2854   case TGSI_TEXTURE_CUBE:
2855   case TGSI_TEXTURE_CUBE_ARRAY:
2856      /* only cube array actually needs W */
2857      FETCH(&r[1], 0, TGSI_CHAN_Y);
2858      FETCH(&r[2], 0, TGSI_CHAN_Z);
2859      FETCH(&r[3], 0, TGSI_CHAN_W);
2860
2861      fetch_assign_deriv_channel(mach, inst, 3, TGSI_CHAN_X, derivs[0]);
2862      fetch_assign_deriv_channel(mach, inst, 3, TGSI_CHAN_Y, derivs[1]);
2863      fetch_assign_deriv_channel(mach, inst, 3, TGSI_CHAN_Z, derivs[2]);
2864
2865      fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2866                  &r[0], &r[1], &r[2], &r[3], &ZeroVec,
2867                  derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
2868                  &r[0], &r[1], &r[2], &r[3]);
2869      break;
2870
2871   default:
2872      assert(0);
2873   }
2874
2875   swizzles[0] = inst->Src[1].Register.SwizzleX;
2876   swizzles[1] = inst->Src[1].Register.SwizzleY;
2877   swizzles[2] = inst->Src[1].Register.SwizzleZ;
2878   swizzles[3] = inst->Src[1].Register.SwizzleW;
2879
2880   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2881      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2882         store_dest(mach, &r[swizzles[chan]],
2883                    &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2884      }
2885   }
2886}
2887
2888
2889/**
2890 * Evaluate a constant-valued coefficient at the position of the
2891 * current quad.
2892 */
2893static void
2894eval_constant_coef(
2895   struct tgsi_exec_machine *mach,
2896   unsigned attrib,
2897   unsigned chan )
2898{
2899   unsigned i;
2900
2901   for( i = 0; i < TGSI_QUAD_SIZE; i++ ) {
2902      mach->Inputs[attrib].xyzw[chan].f[i] = mach->InterpCoefs[attrib].a0[chan];
2903   }
2904}
2905
2906/**
2907 * Evaluate a linear-valued coefficient at the position of the
2908 * current quad.
2909 */
2910static void
2911eval_linear_coef(
2912   struct tgsi_exec_machine *mach,
2913   unsigned attrib,
2914   unsigned chan )
2915{
2916   const float x = mach->QuadPos.xyzw[0].f[0];
2917   const float y = mach->QuadPos.xyzw[1].f[0];
2918   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
2919   const float dady = mach->InterpCoefs[attrib].dady[chan];
2920   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
2921   mach->Inputs[attrib].xyzw[chan].f[0] = a0;
2922   mach->Inputs[attrib].xyzw[chan].f[1] = a0 + dadx;
2923   mach->Inputs[attrib].xyzw[chan].f[2] = a0 + dady;
2924   mach->Inputs[attrib].xyzw[chan].f[3] = a0 + dadx + dady;
2925}
2926
2927/**
2928 * Evaluate a perspective-valued coefficient at the position of the
2929 * current quad.
2930 */
2931static void
2932eval_perspective_coef(
2933   struct tgsi_exec_machine *mach,
2934   unsigned attrib,
2935   unsigned chan )
2936{
2937   const float x = mach->QuadPos.xyzw[0].f[0];
2938   const float y = mach->QuadPos.xyzw[1].f[0];
2939   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
2940   const float dady = mach->InterpCoefs[attrib].dady[chan];
2941   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
2942   const float *w = mach->QuadPos.xyzw[3].f;
2943   /* divide by W here */
2944   mach->Inputs[attrib].xyzw[chan].f[0] = a0 / w[0];
2945   mach->Inputs[attrib].xyzw[chan].f[1] = (a0 + dadx) / w[1];
2946   mach->Inputs[attrib].xyzw[chan].f[2] = (a0 + dady) / w[2];
2947   mach->Inputs[attrib].xyzw[chan].f[3] = (a0 + dadx + dady) / w[3];
2948}
2949
2950
/**
 * Pointer to a coefficient evaluation function: given the machine, an
 * attribute index and a channel index, it fills in that input channel.
 * eval_constant_coef(), eval_linear_coef() and eval_perspective_coef()
 * all have this signature.
 */
typedef void (* eval_coef_func)(
   struct tgsi_exec_machine *mach,
   unsigned attrib,
   unsigned chan );
2955
2956static void
2957exec_declaration(struct tgsi_exec_machine *mach,
2958                 const struct tgsi_full_declaration *decl)
2959{
2960   if (decl->Declaration.File == TGSI_FILE_SAMPLER_VIEW) {
2961      mach->SamplerViews[decl->Range.First] = decl->SamplerView;
2962      return;
2963   }
2964
2965   if (mach->ShaderType == PIPE_SHADER_FRAGMENT) {
2966      if (decl->Declaration.File == TGSI_FILE_INPUT) {
2967         uint first, last, mask;
2968
2969         first = decl->Range.First;
2970         last = decl->Range.Last;
2971         mask = decl->Declaration.UsageMask;
2972
2973         /* XXX we could remove this special-case code since
2974          * mach->InterpCoefs[first].a0 should already have the
2975          * front/back-face value.  But we should first update the
2976          * ureg code to emit the right UsageMask value (WRITEMASK_X).
2977          * Then, we could remove the tgsi_exec_machine::Face field.
2978          */
2979         /* XXX make FACE a system value */
2980         if (decl->Semantic.Name == TGSI_SEMANTIC_FACE) {
2981            uint i;
2982
2983            assert(decl->Semantic.Index == 0);
2984            assert(first == last);
2985
2986            for (i = 0; i < TGSI_QUAD_SIZE; i++) {
2987               mach->Inputs[first].xyzw[0].f[i] = mach->Face;
2988            }
2989         } else {
2990            eval_coef_func eval;
2991            uint i, j;
2992
2993            switch (decl->Interp.Interpolate) {
2994            case TGSI_INTERPOLATE_CONSTANT:
2995               eval = eval_constant_coef;
2996               break;
2997
2998            case TGSI_INTERPOLATE_LINEAR:
2999               eval = eval_linear_coef;
3000               break;
3001
3002            case TGSI_INTERPOLATE_PERSPECTIVE:
3003               eval = eval_perspective_coef;
3004               break;
3005
3006            case TGSI_INTERPOLATE_COLOR:
3007               eval = mach->flatshade_color ? eval_constant_coef : eval_perspective_coef;
3008               break;
3009
3010            default:
3011               assert(0);
3012               return;
3013            }
3014
3015            for (j = 0; j < TGSI_NUM_CHANNELS; j++) {
3016               if (mask & (1 << j)) {
3017                  for (i = first; i <= last; i++) {
3018                     eval(mach, i, j);
3019                  }
3020               }
3021            }
3022         }
3023
3024         if (DEBUG_EXECUTION) {
3025            uint i, j;
3026            for (i = first; i <= last; ++i) {
3027               debug_printf("IN[%2u] = ", i);
3028               for (j = 0; j < TGSI_NUM_CHANNELS; j++) {
3029                  if (j > 0) {
3030                     debug_printf("         ");
3031                  }
3032                  debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
3033                               mach->Inputs[i].xyzw[0].f[j], mach->Inputs[i].xyzw[0].u[j],
3034                               mach->Inputs[i].xyzw[1].f[j], mach->Inputs[i].xyzw[1].u[j],
3035                               mach->Inputs[i].xyzw[2].f[j], mach->Inputs[i].xyzw[2].u[j],
3036                               mach->Inputs[i].xyzw[3].f[j], mach->Inputs[i].xyzw[3].u[j]);
3037               }
3038            }
3039         }
3040      }
3041   }
3042
3043}
3044
/**
 * Pointer to a one-source micro operation: reads \p src and writes the
 * result to \p dst.  Used by exec_scalar_unary() and exec_vector_unary().
 */
typedef void (* micro_unary_op)(union tgsi_exec_channel *dst,
                                const union tgsi_exec_channel *src);
3047
3048static void
3049exec_scalar_unary(struct tgsi_exec_machine *mach,
3050                  const struct tgsi_full_instruction *inst,
3051                  micro_unary_op op,
3052                  enum tgsi_exec_datatype dst_datatype,
3053                  enum tgsi_exec_datatype src_datatype)
3054{
3055   unsigned int chan;
3056   union tgsi_exec_channel src;
3057   union tgsi_exec_channel dst;
3058
3059   fetch_source(mach, &src, &inst->Src[0], TGSI_CHAN_X, src_datatype);
3060   op(&dst, &src);
3061   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3062      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3063         store_dest(mach, &dst, &inst->Dst[0], inst, chan, dst_datatype);
3064      }
3065   }
3066}
3067
3068static void
3069exec_vector_unary(struct tgsi_exec_machine *mach,
3070                  const struct tgsi_full_instruction *inst,
3071                  micro_unary_op op,
3072                  enum tgsi_exec_datatype dst_datatype,
3073                  enum tgsi_exec_datatype src_datatype)
3074{
3075   unsigned int chan;
3076   struct tgsi_exec_vector dst;
3077
3078   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3079      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3080         union tgsi_exec_channel src;
3081
3082         fetch_source(mach, &src, &inst->Src[0], chan, src_datatype);
3083         op(&dst.xyzw[chan], &src);
3084      }
3085   }
3086   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3087      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3088         store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
3089      }
3090   }
3091}
3092
/**
 * Pointer to a two-source micro operation: reads \p src0 and \p src1 and
 * writes the result to \p dst.  Used by exec_scalar_binary() and
 * exec_vector_binary().
 */
typedef void (* micro_binary_op)(union tgsi_exec_channel *dst,
                                 const union tgsi_exec_channel *src0,
                                 const union tgsi_exec_channel *src1);
3096
3097static void
3098exec_scalar_binary(struct tgsi_exec_machine *mach,
3099                   const struct tgsi_full_instruction *inst,
3100                   micro_binary_op op,
3101                   enum tgsi_exec_datatype dst_datatype,
3102                   enum tgsi_exec_datatype src_datatype)
3103{
3104   unsigned int chan;
3105   union tgsi_exec_channel src[2];
3106   union tgsi_exec_channel dst;
3107
3108   fetch_source(mach, &src[0], &inst->Src[0], TGSI_CHAN_X, src_datatype);
3109   fetch_source(mach, &src[1], &inst->Src[1], TGSI_CHAN_X, src_datatype);
3110   op(&dst, &src[0], &src[1]);
3111   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3112      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3113         store_dest(mach, &dst, &inst->Dst[0], inst, chan, dst_datatype);
3114      }
3115   }
3116}
3117
3118static void
3119exec_vector_binary(struct tgsi_exec_machine *mach,
3120                   const struct tgsi_full_instruction *inst,
3121                   micro_binary_op op,
3122                   enum tgsi_exec_datatype dst_datatype,
3123                   enum tgsi_exec_datatype src_datatype)
3124{
3125   unsigned int chan;
3126   struct tgsi_exec_vector dst;
3127
3128   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3129      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3130         union tgsi_exec_channel src[2];
3131
3132         fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
3133         fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
3134         op(&dst.xyzw[chan], &src[0], &src[1]);
3135      }
3136   }
3137   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3138      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3139         store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
3140      }
3141   }
3142}
3143
/**
 * Pointer to a three-source micro operation: reads \p src0, \p src1 and
 * \p src2 and writes the result to \p dst.  Used by
 * exec_vector_trinary().
 */
typedef void (* micro_trinary_op)(union tgsi_exec_channel *dst,
                                  const union tgsi_exec_channel *src0,
                                  const union tgsi_exec_channel *src1,
                                  const union tgsi_exec_channel *src2);
3148
3149static void
3150exec_vector_trinary(struct tgsi_exec_machine *mach,
3151                    const struct tgsi_full_instruction *inst,
3152                    micro_trinary_op op,
3153                    enum tgsi_exec_datatype dst_datatype,
3154                    enum tgsi_exec_datatype src_datatype)
3155{
3156   unsigned int chan;
3157   struct tgsi_exec_vector dst;
3158
3159   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3160      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3161         union tgsi_exec_channel src[3];
3162
3163         fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
3164         fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
3165         fetch_source(mach, &src[2], &inst->Src[2], chan, src_datatype);
3166         op(&dst.xyzw[chan], &src[0], &src[1], &src[2]);
3167      }
3168   }
3169   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3170      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3171         store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
3172      }
3173   }
3174}
3175
/**
 * Pointer to a four-source micro operation: reads \p src0 .. \p src3 and
 * writes the result to \p dst.  Used by exec_vector_quaternary().
 */
typedef void (* micro_quaternary_op)(union tgsi_exec_channel *dst,
                                     const union tgsi_exec_channel *src0,
                                     const union tgsi_exec_channel *src1,
                                     const union tgsi_exec_channel *src2,
                                     const union tgsi_exec_channel *src3);
3181
3182static void
3183exec_vector_quaternary(struct tgsi_exec_machine *mach,
3184                       const struct tgsi_full_instruction *inst,
3185                       micro_quaternary_op op,
3186                       enum tgsi_exec_datatype dst_datatype,
3187                       enum tgsi_exec_datatype src_datatype)
3188{
3189   unsigned int chan;
3190   struct tgsi_exec_vector dst;
3191
3192   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3193      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3194         union tgsi_exec_channel src[4];
3195
3196         fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
3197         fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
3198         fetch_source(mach, &src[2], &inst->Src[2], chan, src_datatype);
3199         fetch_source(mach, &src[3], &inst->Src[3], chan, src_datatype);
3200         op(&dst.xyzw[chan], &src[0], &src[1], &src[2], &src[3]);
3201      }
3202   }
3203   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3204      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3205         store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
3206      }
3207   }
3208}
3209
3210static void
3211exec_dp3(struct tgsi_exec_machine *mach,
3212         const struct tgsi_full_instruction *inst)
3213{
3214   unsigned int chan;
3215   union tgsi_exec_channel arg[3];
3216
3217   fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3218   fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3219   micro_mul(&arg[2], &arg[0], &arg[1]);
3220
3221   for (chan = TGSI_CHAN_Y; chan <= TGSI_CHAN_Z; chan++) {
3222      fetch_source(mach, &arg[0], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
3223      fetch_source(mach, &arg[1], &inst->Src[1], chan, TGSI_EXEC_DATA_FLOAT);
3224      micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
3225   }
3226
3227   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3228      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3229         store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
3230      }
3231   }
3232}
3233
3234static void
3235exec_dp4(struct tgsi_exec_machine *mach,
3236         const struct tgsi_full_instruction *inst)
3237{
3238   unsigned int chan;
3239   union tgsi_exec_channel arg[3];
3240
3241   fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3242   fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3243   micro_mul(&arg[2], &arg[0], &arg[1]);
3244
3245   for (chan = TGSI_CHAN_Y; chan <= TGSI_CHAN_W; chan++) {
3246      fetch_source(mach, &arg[0], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
3247      fetch_source(mach, &arg[1], &inst->Src[1], chan, TGSI_EXEC_DATA_FLOAT);
3248      micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
3249   }
3250
3251   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3252      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3253         store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
3254      }
3255   }
3256}
3257
3258static void
3259exec_dp2a(struct tgsi_exec_machine *mach,
3260          const struct tgsi_full_instruction *inst)
3261{
3262   unsigned int chan;
3263   union tgsi_exec_channel arg[3];
3264
3265   fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3266   fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3267   micro_mul(&arg[2], &arg[0], &arg[1]);
3268
3269   fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3270   fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3271   micro_mad(&arg[0], &arg[0], &arg[1], &arg[2]);
3272
3273   fetch_source(mach, &arg[1], &inst->Src[2], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3274   micro_add(&arg[0], &arg[0], &arg[1]);
3275
3276   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3277      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3278         store_dest(mach, &arg[0], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
3279      }
3280   }
3281}
3282
3283static void
3284exec_dph(struct tgsi_exec_machine *mach,
3285         const struct tgsi_full_instruction *inst)
3286{
3287   unsigned int chan;
3288   union tgsi_exec_channel arg[3];
3289
3290   fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3291   fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3292   micro_mul(&arg[2], &arg[0], &arg[1]);
3293
3294   fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3295   fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3296   micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
3297
3298   fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
3299   fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
3300   micro_mad(&arg[0], &arg[0], &arg[1], &arg[2]);
3301
3302   fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
3303   micro_add(&arg[0], &arg[0], &arg[1]);
3304
3305   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3306      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3307         store_dest(mach, &arg[0], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
3308      }
3309   }
3310}
3311
3312static void
3313exec_dp2(struct tgsi_exec_machine *mach,
3314         const struct tgsi_full_instruction *inst)
3315{
3316   unsigned int chan;
3317   union tgsi_exec_channel arg[3];
3318
3319   fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3320   fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3321   micro_mul(&arg[2], &arg[0], &arg[1]);
3322
3323   fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3324   fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3325   micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
3326
3327   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3328      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3329         store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
3330      }
3331   }
3332}
3333
3334static void
3335exec_pk2h(struct tgsi_exec_machine *mach,
3336          const struct tgsi_full_instruction *inst)
3337{
3338   unsigned chan;
3339   union tgsi_exec_channel arg[2], dst;
3340
3341   fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3342   fetch_source(mach, &arg[1], &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3343   for (chan = 0; chan < TGSI_QUAD_SIZE; chan++) {
3344      dst.u[chan] = util_float_to_half(arg[0].f[chan]) |
3345         (util_float_to_half(arg[1].f[chan]) << 16);
3346   }
3347   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3348      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3349         store_dest(mach, &dst, &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_UINT);
3350      }
3351   }
3352}
3353
3354static void
3355exec_up2h(struct tgsi_exec_machine *mach,
3356          const struct tgsi_full_instruction *inst)
3357{
3358   unsigned chan;
3359   union tgsi_exec_channel arg, dst[2];
3360
3361   fetch_source(mach, &arg, &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_UINT);
3362   for (chan = 0; chan < TGSI_QUAD_SIZE; chan++) {
3363      dst[0].f[chan] = util_half_to_float(arg.u[chan] & 0xffff);
3364      dst[1].f[chan] = util_half_to_float(arg.u[chan] >> 16);
3365   }
3366   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3367      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3368         store_dest(mach, &dst[chan & 1], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
3369      }
3370   }
3371}
3372
3373static void
3374exec_scs(struct tgsi_exec_machine *mach,
3375         const struct tgsi_full_instruction *inst)
3376{
3377   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY) {
3378      union tgsi_exec_channel arg;
3379      union tgsi_exec_channel result;
3380
3381      fetch_source(mach, &arg, &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3382
3383      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
3384         micro_cos(&result, &arg);
3385         store_dest(mach, &result, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3386      }
3387      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
3388         micro_sin(&result, &arg);
3389         store_dest(mach, &result, &inst->Dst[0], inst, TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3390      }
3391   }
3392   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
3393      store_dest(mach, &ZeroVec, &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
3394   }
3395   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
3396      store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
3397   }
3398}
3399
3400static void
3401exec_xpd(struct tgsi_exec_machine *mach,
3402         const struct tgsi_full_instruction *inst)
3403{
3404   union tgsi_exec_channel r[6];
3405   union tgsi_exec_channel d[3];
3406
3407   fetch_source(mach, &r[0], &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3408   fetch_source(mach, &r[1], &inst->Src[1], TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
3409
3410   micro_mul(&r[2], &r[0], &r[1]);
3411
3412   fetch_source(mach, &r[3], &inst->Src[0], TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
3413   fetch_source(mach, &r[4], &inst->Src[1], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3414
3415   micro_mul(&r[5], &r[3], &r[4] );
3416   micro_sub(&d[TGSI_CHAN_X], &r[2], &r[5]);
3417
3418   fetch_source(mach, &r[2], &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3419
3420   micro_mul(&r[3], &r[3], &r[2]);
3421
3422   fetch_source(mach, &r[5], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3423
3424   micro_mul(&r[1], &r[1], &r[5]);
3425   micro_sub(&d[TGSI_CHAN_Y], &r[3], &r[1]);
3426
3427   micro_mul(&r[5], &r[5], &r[4]);
3428   micro_mul(&r[0], &r[0], &r[2]);
3429   micro_sub(&d[TGSI_CHAN_Z], &r[5], &r[0]);
3430
3431   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
3432      store_dest(mach, &d[TGSI_CHAN_X], &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3433   }
3434   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
3435      store_dest(mach, &d[TGSI_CHAN_Y], &inst->Dst[0], inst, TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3436   }
3437   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
3438      store_dest(mach, &d[TGSI_CHAN_Z], &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
3439   }
3440   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
3441      store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
3442   }
3443}
3444
3445static void
3446exec_dst(struct tgsi_exec_machine *mach,
3447         const struct tgsi_full_instruction *inst)
3448{
3449   union tgsi_exec_channel r[2];
3450   union tgsi_exec_channel d[4];
3451
3452   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
3453      fetch_source(mach, &r[0], &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3454      fetch_source(mach, &r[1], &inst->Src[1], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3455      micro_mul(&d[TGSI_CHAN_Y], &r[0], &r[1]);
3456   }
3457   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
3458      fetch_source(mach, &d[TGSI_CHAN_Z], &inst->Src[0], TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
3459   }
3460   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
3461      fetch_source(mach, &d[TGSI_CHAN_W], &inst->Src[1], TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
3462   }
3463
3464   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
3465      store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3466   }
3467   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
3468      store_dest(mach, &d[TGSI_CHAN_Y], &inst->Dst[0], inst, TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3469   }
3470   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
3471      store_dest(mach, &d[TGSI_CHAN_Z], &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
3472   }
3473   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
3474      store_dest(mach, &d[TGSI_CHAN_W], &inst->Dst[0], inst, TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
3475   }
3476}
3477
3478static void
3479exec_log(struct tgsi_exec_machine *mach,
3480         const struct tgsi_full_instruction *inst)
3481{
3482   union tgsi_exec_channel r[3];
3483
3484   fetch_source(mach, &r[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3485   micro_abs(&r[2], &r[0]);  /* r2 = abs(r0) */
3486   micro_lg2(&r[1], &r[2]);  /* r1 = lg2(r2) */
3487   micro_flr(&r[0], &r[1]);  /* r0 = floor(r1) */
3488   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
3489      store_dest(mach, &r[0], &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3490   }
3491   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
3492      micro_exp2(&r[0], &r[0]);       /* r0 = 2 ^ r0 */
3493      micro_div(&r[0], &r[2], &r[0]); /* r0 = r2 / r0 */
3494      store_dest(mach, &r[0], &inst->Dst[0], inst, TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3495   }
3496   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
3497      store_dest(mach, &r[1], &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
3498   }
3499   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
3500      store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
3501   }
3502}
3503
3504static void
3505exec_exp(struct tgsi_exec_machine *mach,
3506         const struct tgsi_full_instruction *inst)
3507{
3508   union tgsi_exec_channel r[3];
3509
3510   fetch_source(mach, &r[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3511   micro_flr(&r[1], &r[0]);  /* r1 = floor(r0) */
3512   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
3513      micro_exp2(&r[2], &r[1]);       /* r2 = 2 ^ r1 */
3514      store_dest(mach, &r[2], &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3515   }
3516   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
3517      micro_sub(&r[2], &r[0], &r[1]); /* r2 = r0 - r1 */
3518      store_dest(mach, &r[2], &inst->Dst[0], inst, TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3519   }
3520   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
3521      micro_exp2(&r[2], &r[0]);       /* r2 = 2 ^ r0 */
3522      store_dest(mach, &r[2], &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
3523   }
3524   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
3525      store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
3526   }
3527}
3528
3529static void
3530exec_lit(struct tgsi_exec_machine *mach,
3531         const struct tgsi_full_instruction *inst)
3532{
3533   union tgsi_exec_channel r[3];
3534   union tgsi_exec_channel d[3];
3535
3536   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_YZ) {
3537      fetch_source(mach, &r[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3538      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
3539         fetch_source(mach, &r[1], &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3540         micro_max(&r[1], &r[1], &ZeroVec);
3541
3542         fetch_source(mach, &r[2], &inst->Src[0], TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
3543         micro_min(&r[2], &r[2], &P128Vec);
3544         micro_max(&r[2], &r[2], &M128Vec);
3545         micro_pow(&r[1], &r[1], &r[2]);
3546         micro_lt(&d[TGSI_CHAN_Z], &ZeroVec, &r[0], &r[1], &ZeroVec);
3547         store_dest(mach, &d[TGSI_CHAN_Z], &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
3548      }
3549      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
3550         micro_max(&d[TGSI_CHAN_Y], &r[0], &ZeroVec);
3551         store_dest(mach, &d[TGSI_CHAN_Y], &inst->Dst[0], inst, TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3552      }
3553   }
3554   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
3555      store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3556   }
3557
3558   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
3559      store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
3560   }
3561}
3562
3563static void
3564exec_break(struct tgsi_exec_machine *mach)
3565{
3566   if (mach->BreakType == TGSI_EXEC_BREAK_INSIDE_LOOP) {
3567      /* turn off loop channels for each enabled exec channel */
3568      mach->LoopMask &= ~mach->ExecMask;
3569      /* Todo: if mach->LoopMask == 0, jump to end of loop */
3570      UPDATE_EXEC_MASK(mach);
3571   } else {
3572      assert(mach->BreakType == TGSI_EXEC_BREAK_INSIDE_SWITCH);
3573
3574      mach->Switch.mask = 0x0;
3575
3576      UPDATE_EXEC_MASK(mach);
3577   }
3578}
3579
3580static void
3581exec_switch(struct tgsi_exec_machine *mach,
3582            const struct tgsi_full_instruction *inst)
3583{
3584   assert(mach->SwitchStackTop < TGSI_EXEC_MAX_SWITCH_NESTING);
3585   assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
3586
3587   mach->SwitchStack[mach->SwitchStackTop++] = mach->Switch;
3588   fetch_source(mach, &mach->Switch.selector, &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_UINT);
3589   mach->Switch.mask = 0x0;
3590   mach->Switch.defaultMask = 0x0;
3591
3592   mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
3593   mach->BreakType = TGSI_EXEC_BREAK_INSIDE_SWITCH;
3594
3595   UPDATE_EXEC_MASK(mach);
3596}
3597
3598static void
3599exec_case(struct tgsi_exec_machine *mach,
3600          const struct tgsi_full_instruction *inst)
3601{
3602   uint prevMask = mach->SwitchStack[mach->SwitchStackTop - 1].mask;
3603   union tgsi_exec_channel src;
3604   uint mask = 0;
3605
3606   fetch_source(mach, &src, &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_UINT);
3607
3608   if (mach->Switch.selector.u[0] == src.u[0]) {
3609      mask |= 0x1;
3610   }
3611   if (mach->Switch.selector.u[1] == src.u[1]) {
3612      mask |= 0x2;
3613   }
3614   if (mach->Switch.selector.u[2] == src.u[2]) {
3615      mask |= 0x4;
3616   }
3617   if (mach->Switch.selector.u[3] == src.u[3]) {
3618      mask |= 0x8;
3619   }
3620
3621   mach->Switch.defaultMask |= mask;
3622
3623   mach->Switch.mask |= mask & prevMask;
3624
3625   UPDATE_EXEC_MASK(mach);
3626}
3627
3628/* FIXME: this will only work if default is last */
3629static void
3630exec_default(struct tgsi_exec_machine *mach)
3631{
3632   uint prevMask = mach->SwitchStack[mach->SwitchStackTop - 1].mask;
3633
3634   mach->Switch.mask |= ~mach->Switch.defaultMask & prevMask;
3635
3636   UPDATE_EXEC_MASK(mach);
3637}
3638
3639static void
3640exec_endswitch(struct tgsi_exec_machine *mach)
3641{
3642   mach->Switch = mach->SwitchStack[--mach->SwitchStackTop];
3643   mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
3644
3645   UPDATE_EXEC_MASK(mach);
3646}
3647
/**
 * Micro-op producing a 64-bit channel from 64-bit source(s).
 * The binary and trinary executors in this file pass an array of two or
 * three source channels through the single \c src pointer.
 */
typedef void (* micro_dop)(union tgsi_double_channel *dst,
                           const union tgsi_double_channel *src);

/**
 * Micro-op producing a 64-bit channel from one 64-bit source (src0) and
 * one 32-bit source (src1).
 */
typedef void (* micro_dop_sop)(union tgsi_double_channel *dst,
                               const union tgsi_double_channel *src0,
                               union tgsi_exec_channel *src1);

/**
 * Micro-op producing a 64-bit channel from a 32-bit source channel.
 */
typedef void (* micro_dop_s)(union tgsi_double_channel *dst,
                             const union tgsi_exec_channel *src);

/**
 * Micro-op producing a 32-bit channel from a 64-bit source channel.
 */
typedef void (* micro_sop_d)(union tgsi_exec_channel *dst,
                             const union tgsi_double_channel *src);
3660
3661static void
3662fetch_double_channel(struct tgsi_exec_machine *mach,
3663                     union tgsi_double_channel *chan,
3664                     const struct tgsi_full_src_register *reg,
3665                     uint chan_0,
3666                     uint chan_1)
3667{
3668   union tgsi_exec_channel src[2];
3669   uint i;
3670
3671   fetch_source_d(mach, &src[0], reg, chan_0, TGSI_EXEC_DATA_UINT);
3672   fetch_source_d(mach, &src[1], reg, chan_1, TGSI_EXEC_DATA_UINT);
3673
3674   for (i = 0; i < TGSI_QUAD_SIZE; i++) {
3675      chan->u[i][0] = src[0].u[i];
3676      chan->u[i][1] = src[1].u[i];
3677   }
3678   if (reg->Register.Absolute) {
3679      micro_dabs(chan, chan);
3680   }
3681   if (reg->Register.Negate) {
3682      micro_dneg(chan, chan);
3683   }
3684}
3685
3686static void
3687store_double_channel(struct tgsi_exec_machine *mach,
3688                     const union tgsi_double_channel *chan,
3689                     const struct tgsi_full_dst_register *reg,
3690                     const struct tgsi_full_instruction *inst,
3691                     uint chan_0,
3692                     uint chan_1)
3693{
3694   union tgsi_exec_channel dst[2];
3695   uint i;
3696   union tgsi_double_channel temp;
3697   const uint execmask = mach->ExecMask;
3698
3699   if (!inst->Instruction.Saturate) {
3700      for (i = 0; i < TGSI_QUAD_SIZE; i++)
3701         if (execmask & (1 << i)) {
3702            dst[0].u[i] = chan->u[i][0];
3703            dst[1].u[i] = chan->u[i][1];
3704         }
3705   }
3706   else {
3707      for (i = 0; i < TGSI_QUAD_SIZE; i++)
3708         if (execmask & (1 << i)) {
3709            if (chan->d[i] < 0.0)
3710               temp.d[i] = 0.0;
3711            else if (chan->d[i] > 1.0)
3712               temp.d[i] = 1.0;
3713            else
3714               temp.d[i] = chan->d[i];
3715
3716            dst[0].u[i] = temp.u[i][0];
3717            dst[1].u[i] = temp.u[i][1];
3718         }
3719   }
3720
3721   store_dest_double(mach, &dst[0], reg, inst, chan_0, TGSI_EXEC_DATA_UINT);
3722   if (chan_1 != -1)
3723      store_dest_double(mach, &dst[1], reg, inst, chan_1, TGSI_EXEC_DATA_UINT);
3724}
3725
3726static void
3727exec_double_unary(struct tgsi_exec_machine *mach,
3728                  const struct tgsi_full_instruction *inst,
3729                  micro_dop op)
3730{
3731   union tgsi_double_channel src;
3732   union tgsi_double_channel dst;
3733
3734   if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY) == TGSI_WRITEMASK_XY) {
3735      fetch_double_channel(mach, &src, &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
3736      op(&dst, &src);
3737      store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_CHAN_Y);
3738   }
3739   if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_ZW) == TGSI_WRITEMASK_ZW) {
3740      fetch_double_channel(mach, &src, &inst->Src[0], TGSI_CHAN_Z, TGSI_CHAN_W);
3741      op(&dst, &src);
3742      store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_CHAN_W);
3743   }
3744}
3745
3746static void
3747exec_double_binary(struct tgsi_exec_machine *mach,
3748                   const struct tgsi_full_instruction *inst,
3749                   micro_dop op,
3750                   enum tgsi_exec_datatype dst_datatype)
3751{
3752   union tgsi_double_channel src[2];
3753   union tgsi_double_channel dst;
3754   int first_dest_chan, second_dest_chan;
3755   int wmask;
3756
3757   wmask = inst->Dst[0].Register.WriteMask;
3758   /* these are & because of the way DSLT etc store their destinations */
3759   if (wmask & TGSI_WRITEMASK_XY) {
3760      first_dest_chan = TGSI_CHAN_X;
3761      second_dest_chan = TGSI_CHAN_Y;
3762      if (dst_datatype == TGSI_EXEC_DATA_UINT) {
3763         first_dest_chan = (wmask & TGSI_WRITEMASK_X) ? TGSI_CHAN_X : TGSI_CHAN_Y;
3764         second_dest_chan = -1;
3765      }
3766
3767      fetch_double_channel(mach, &src[0], &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
3768      fetch_double_channel(mach, &src[1], &inst->Src[1], TGSI_CHAN_X, TGSI_CHAN_Y);
3769      op(&dst, src);
3770      store_double_channel(mach, &dst, &inst->Dst[0], inst, first_dest_chan, second_dest_chan);
3771   }
3772
3773   if (wmask & TGSI_WRITEMASK_ZW) {
3774      first_dest_chan = TGSI_CHAN_Z;
3775      second_dest_chan = TGSI_CHAN_W;
3776      if (dst_datatype == TGSI_EXEC_DATA_UINT) {
3777         first_dest_chan = (wmask & TGSI_WRITEMASK_Z) ? TGSI_CHAN_Z : TGSI_CHAN_W;
3778         second_dest_chan = -1;
3779      }
3780
3781      fetch_double_channel(mach, &src[0], &inst->Src[0], TGSI_CHAN_Z, TGSI_CHAN_W);
3782      fetch_double_channel(mach, &src[1], &inst->Src[1], TGSI_CHAN_Z, TGSI_CHAN_W);
3783      op(&dst, src);
3784      store_double_channel(mach, &dst, &inst->Dst[0], inst, first_dest_chan, second_dest_chan);
3785   }
3786}
3787
3788static void
3789exec_double_trinary(struct tgsi_exec_machine *mach,
3790                    const struct tgsi_full_instruction *inst,
3791                    micro_dop op)
3792{
3793   union tgsi_double_channel src[3];
3794   union tgsi_double_channel dst;
3795
3796   if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY) == TGSI_WRITEMASK_XY) {
3797      fetch_double_channel(mach, &src[0], &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
3798      fetch_double_channel(mach, &src[1], &inst->Src[1], TGSI_CHAN_X, TGSI_CHAN_Y);
3799      fetch_double_channel(mach, &src[2], &inst->Src[2], TGSI_CHAN_X, TGSI_CHAN_Y);
3800      op(&dst, src);
3801      store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_CHAN_Y);
3802   }
3803   if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_ZW) == TGSI_WRITEMASK_ZW) {
3804      fetch_double_channel(mach, &src[0], &inst->Src[0], TGSI_CHAN_Z, TGSI_CHAN_W);
3805      fetch_double_channel(mach, &src[1], &inst->Src[1], TGSI_CHAN_Z, TGSI_CHAN_W);
3806      fetch_double_channel(mach, &src[2], &inst->Src[2], TGSI_CHAN_Z, TGSI_CHAN_W);
3807      op(&dst, src);
3808      store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_CHAN_W);
3809   }
3810}
3811
3812static void
3813exec_dldexp(struct tgsi_exec_machine *mach,
3814            const struct tgsi_full_instruction *inst)
3815{
3816   union tgsi_double_channel src0;
3817   union tgsi_exec_channel src1;
3818   union tgsi_double_channel dst;
3819   int wmask;
3820
3821   wmask = inst->Dst[0].Register.WriteMask;
3822   if (wmask & TGSI_WRITEMASK_XY) {
3823      fetch_double_channel(mach, &src0, &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
3824      fetch_source(mach, &src1, &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_INT);
3825      micro_dldexp(&dst, &src0, &src1);
3826      store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_CHAN_Y);
3827   }
3828
3829   if (wmask & TGSI_WRITEMASK_ZW) {
3830      fetch_double_channel(mach, &src0, &inst->Src[0], TGSI_CHAN_Z, TGSI_CHAN_W);
3831      fetch_source(mach, &src1, &inst->Src[1], TGSI_CHAN_Z, TGSI_EXEC_DATA_INT);
3832      micro_dldexp(&dst, &src0, &src1);
3833      store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_CHAN_W);
3834   }
3835}
3836
3837static void
3838exec_dfracexp(struct tgsi_exec_machine *mach,
3839              const struct tgsi_full_instruction *inst)
3840{
3841   union tgsi_double_channel src;
3842   union tgsi_double_channel dst;
3843   union tgsi_exec_channel dst_exp;
3844
3845   if (((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY) == TGSI_WRITEMASK_XY)) {
3846      fetch_double_channel(mach, &src, &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
3847      micro_dfracexp(&dst, &dst_exp, &src);
3848      store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_CHAN_Y);
3849      store_dest(mach, &dst_exp, &inst->Dst[1], inst, ffs(inst->Dst[1].Register.WriteMask) - 1, TGSI_EXEC_DATA_INT);
3850   }
3851   if (((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_ZW) == TGSI_WRITEMASK_ZW)) {
3852      fetch_double_channel(mach, &src, &inst->Src[0], TGSI_CHAN_Z, TGSI_CHAN_W);
3853      micro_dfracexp(&dst, &dst_exp, &src);
3854      store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_CHAN_W);
3855      store_dest(mach, &dst_exp, &inst->Dst[1], inst, ffs(inst->Dst[1].Register.WriteMask) - 1, TGSI_EXEC_DATA_INT);
3856   }
3857}
3858
3859static void
3860exec_arg0_64_arg1_32(struct tgsi_exec_machine *mach,
3861            const struct tgsi_full_instruction *inst,
3862            micro_dop_sop op)
3863{
3864   union tgsi_double_channel src0;
3865   union tgsi_exec_channel src1;
3866   union tgsi_double_channel dst;
3867   int wmask;
3868
3869   wmask = inst->Dst[0].Register.WriteMask;
3870   if (wmask & TGSI_WRITEMASK_XY) {
3871      fetch_double_channel(mach, &src0, &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
3872      fetch_source(mach, &src1, &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_INT);
3873      op(&dst, &src0, &src1);
3874      store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_CHAN_Y);
3875   }
3876
3877   if (wmask & TGSI_WRITEMASK_ZW) {
3878      fetch_double_channel(mach, &src0, &inst->Src[0], TGSI_CHAN_Z, TGSI_CHAN_W);
3879      fetch_source(mach, &src1, &inst->Src[1], TGSI_CHAN_Z, TGSI_EXEC_DATA_INT);
3880      op(&dst, &src0, &src1);
3881      store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_CHAN_W);
3882   }
3883}
3884
3885static int
3886get_image_coord_dim(unsigned tgsi_tex)
3887{
3888   int dim;
3889   switch (tgsi_tex) {
3890   case TGSI_TEXTURE_BUFFER:
3891   case TGSI_TEXTURE_1D:
3892      dim = 1;
3893      break;
3894   case TGSI_TEXTURE_2D:
3895   case TGSI_TEXTURE_RECT:
3896   case TGSI_TEXTURE_1D_ARRAY:
3897   case TGSI_TEXTURE_2D_MSAA:
3898      dim = 2;
3899      break;
3900   case TGSI_TEXTURE_3D:
3901   case TGSI_TEXTURE_CUBE:
3902   case TGSI_TEXTURE_2D_ARRAY:
3903   case TGSI_TEXTURE_2D_ARRAY_MSAA:
3904   case TGSI_TEXTURE_CUBE_ARRAY:
3905      dim = 3;
3906      break;
3907   default:
3908      assert(!"unknown texture target");
3909      dim = 0;
3910      break;
3911   }
3912
3913   return dim;
3914}
3915
3916static int
3917get_image_coord_sample(unsigned tgsi_tex)
3918{
3919   int sample = 0;
3920   switch (tgsi_tex) {
3921   case TGSI_TEXTURE_2D_MSAA:
3922      sample = 3;
3923      break;
3924   case TGSI_TEXTURE_2D_ARRAY_MSAA:
3925      sample = 4;
3926      break;
3927   default:
3928      break;
3929   }
3930   return sample;
3931}
3932
3933static void
3934exec_load_img(struct tgsi_exec_machine *mach,
3935              const struct tgsi_full_instruction *inst)
3936{
3937   union tgsi_exec_channel r[4], sample_r;
3938   uint unit;
3939   int sample;
3940   int i, j;
3941   int dim;
3942   uint chan;
3943   float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
3944   struct tgsi_image_params params;
3945   int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
3946
3947   unit = fetch_sampler_unit(mach, inst, 0);
3948   dim = get_image_coord_dim(inst->Memory.Texture);
3949   sample = get_image_coord_sample(inst->Memory.Texture);
3950   assert(dim <= 3);
3951
3952   params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
3953   params.unit = unit;
3954   params.tgsi_tex_instr = inst->Memory.Texture;
3955   params.format = inst->Memory.Format;
3956
3957   for (i = 0; i < dim; i++) {
3958      IFETCH(&r[i], 1, TGSI_CHAN_X + i);
3959   }
3960
3961   if (sample)
3962      IFETCH(&sample_r, 1, TGSI_CHAN_X + sample);
3963
3964   mach->Image->load(mach->Image, &params,
3965                     r[0].i, r[1].i, r[2].i, sample_r.i,
3966                     rgba);
3967   for (j = 0; j < TGSI_QUAD_SIZE; j++) {
3968      r[0].f[j] = rgba[0][j];
3969      r[1].f[j] = rgba[1][j];
3970      r[2].f[j] = rgba[2][j];
3971      r[3].f[j] = rgba[3][j];
3972   }
3973   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3974      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3975         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
3976      }
3977   }
3978}
3979
3980static void
3981exec_load_buf(struct tgsi_exec_machine *mach,
3982              const struct tgsi_full_instruction *inst)
3983{
3984   union tgsi_exec_channel r[4];
3985   uint unit;
3986   int j;
3987   uint chan;
3988   float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
3989   struct tgsi_buffer_params params;
3990   int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
3991
3992   unit = fetch_sampler_unit(mach, inst, 0);
3993
3994   params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
3995   params.unit = unit;
3996   IFETCH(&r[0], 1, TGSI_CHAN_X);
3997
3998   mach->Buffer->load(mach->Buffer, &params,
3999                      r[0].i, rgba);
4000   for (j = 0; j < TGSI_QUAD_SIZE; j++) {
4001      r[0].f[j] = rgba[0][j];
4002      r[1].f[j] = rgba[1][j];
4003      r[2].f[j] = rgba[2][j];
4004      r[3].f[j] = rgba[3][j];
4005   }
4006   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
4007      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
4008         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
4009      }
4010   }
4011}
4012
4013static void
4014exec_load_mem(struct tgsi_exec_machine *mach,
4015              const struct tgsi_full_instruction *inst)
4016{
4017   union tgsi_exec_channel r[4];
4018   uint chan;
4019   char *ptr = mach->LocalMem;
4020   uint32_t offset;
4021   int j;
4022
4023   IFETCH(&r[0], 1, TGSI_CHAN_X);
4024   if (r[0].u[0] >= mach->LocalMemSize)
4025      return;
4026
4027   offset = r[0].u[0];
4028   ptr += offset;
4029
4030   for (j = 0; j < TGSI_QUAD_SIZE; j++) {
4031      for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
4032         if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
4033            memcpy(&r[chan].u[j], ptr + (4 * chan), 4);
4034         }
4035      }
4036   }
4037
4038   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
4039      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
4040         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
4041      }
4042   }
4043}
4044
4045static void
4046exec_load(struct tgsi_exec_machine *mach,
4047          const struct tgsi_full_instruction *inst)
4048{
4049   if (inst->Src[0].Register.File == TGSI_FILE_IMAGE)
4050      exec_load_img(mach, inst);
4051   else if (inst->Src[0].Register.File == TGSI_FILE_BUFFER)
4052      exec_load_buf(mach, inst);
4053   else if (inst->Src[0].Register.File == TGSI_FILE_MEMORY)
4054      exec_load_mem(mach, inst);
4055}
4056
4057static void
4058exec_store_img(struct tgsi_exec_machine *mach,
4059               const struct tgsi_full_instruction *inst)
4060{
4061   union tgsi_exec_channel r[3], sample_r;
4062   union tgsi_exec_channel value[4];
4063   float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
4064   struct tgsi_image_params params;
4065   int dim;
4066   int sample;
4067   int i, j;
4068   uint unit;
4069   int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
4070   unit = inst->Dst[0].Register.Index;
4071   dim = get_image_coord_dim(inst->Memory.Texture);
4072   sample = get_image_coord_sample(inst->Memory.Texture);
4073   assert(dim <= 3);
4074
4075   params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
4076   params.unit = unit;
4077   params.tgsi_tex_instr = inst->Memory.Texture;
4078   params.format = inst->Memory.Format;
4079
4080   for (i = 0; i < dim; i++) {
4081      IFETCH(&r[i], 0, TGSI_CHAN_X + i);
4082   }
4083
4084   for (i = 0; i < 4; i++) {
4085      FETCH(&value[i], 1, TGSI_CHAN_X + i);
4086   }
4087   if (sample)
4088      IFETCH(&sample_r, 0, TGSI_CHAN_X + sample);
4089
4090   for (j = 0; j < TGSI_QUAD_SIZE; j++) {
4091      rgba[0][j] = value[0].f[j];
4092      rgba[1][j] = value[1].f[j];
4093      rgba[2][j] = value[2].f[j];
4094      rgba[3][j] = value[3].f[j];
4095   }
4096
4097   mach->Image->store(mach->Image, &params,
4098                      r[0].i, r[1].i, r[2].i, sample_r.i,
4099                      rgba);
4100}
4101
4102static void
4103exec_store_buf(struct tgsi_exec_machine *mach,
4104               const struct tgsi_full_instruction *inst)
4105{
4106   union tgsi_exec_channel r[3];
4107   union tgsi_exec_channel value[4];
4108   float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
4109   struct tgsi_buffer_params params;
4110   int i, j;
4111   uint unit;
4112   int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
4113
4114   unit = inst->Dst[0].Register.Index;
4115
4116   params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
4117   params.unit = unit;
4118   params.writemask = inst->Dst[0].Register.WriteMask;
4119
4120   IFETCH(&r[0], 0, TGSI_CHAN_X);
4121   for (i = 0; i < 4; i++) {
4122      FETCH(&value[i], 1, TGSI_CHAN_X + i);
4123   }
4124
4125   for (j = 0; j < TGSI_QUAD_SIZE; j++) {
4126      rgba[0][j] = value[0].f[j];
4127      rgba[1][j] = value[1].f[j];
4128      rgba[2][j] = value[2].f[j];
4129      rgba[3][j] = value[3].f[j];
4130   }
4131
4132   mach->Buffer->store(mach->Buffer, &params,
4133                      r[0].i,
4134                      rgba);
4135}
4136
4137static void
4138exec_store_mem(struct tgsi_exec_machine *mach,
4139               const struct tgsi_full_instruction *inst)
4140{
4141   union tgsi_exec_channel r[3];
4142   union tgsi_exec_channel value[4];
4143   uint i, chan;
4144   char *ptr = mach->LocalMem;
4145   int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
4146   int execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
4147
4148   IFETCH(&r[0], 0, TGSI_CHAN_X);
4149
4150   for (i = 0; i < 4; i++) {
4151      FETCH(&value[i], 1, TGSI_CHAN_X + i);
4152   }
4153
4154   if (r[0].u[0] >= mach->LocalMemSize)
4155      return;
4156   ptr += r[0].u[0];
4157
4158   for (i = 0; i < TGSI_QUAD_SIZE; i++) {
4159      if (execmask & (1 << i)) {
4160         for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
4161            if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
4162               memcpy(ptr + (chan * 4), &value[chan].u[0], 4);
4163            }
4164         }
4165      }
4166   }
4167}
4168
4169static void
4170exec_store(struct tgsi_exec_machine *mach,
4171           const struct tgsi_full_instruction *inst)
4172{
4173   if (inst->Dst[0].Register.File == TGSI_FILE_IMAGE)
4174      exec_store_img(mach, inst);
4175   else if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER)
4176      exec_store_buf(mach, inst);
4177   else if (inst->Dst[0].Register.File == TGSI_FILE_MEMORY)
4178      exec_store_mem(mach, inst);
4179}
4180
4181static void
4182exec_atomop_img(struct tgsi_exec_machine *mach,
4183                const struct tgsi_full_instruction *inst)
4184{
4185   union tgsi_exec_channel r[4], sample_r;
4186   union tgsi_exec_channel value[4], value2[4];
4187   float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
4188   float rgba2[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
4189   struct tgsi_image_params params;
4190   int dim;
4191   int sample;
4192   int i, j;
4193   uint unit, chan;
4194   int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
4195   unit = fetch_sampler_unit(mach, inst, 0);
4196   dim = get_image_coord_dim(inst->Memory.Texture);
4197   sample = get_image_coord_sample(inst->Memory.Texture);
4198   assert(dim <= 3);
4199
4200   params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
4201   params.unit = unit;
4202   params.tgsi_tex_instr = inst->Memory.Texture;
4203   params.format = inst->Memory.Format;
4204
4205   for (i = 0; i < dim; i++) {
4206      IFETCH(&r[i], 1, TGSI_CHAN_X + i);
4207   }
4208
4209   for (i = 0; i < 4; i++) {
4210      FETCH(&value[i], 2, TGSI_CHAN_X + i);
4211      if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
4212         FETCH(&value2[i], 3, TGSI_CHAN_X + i);
4213   }
4214   if (sample)
4215      IFETCH(&sample_r, 1, TGSI_CHAN_X + sample);
4216
4217   for (j = 0; j < TGSI_QUAD_SIZE; j++) {
4218      rgba[0][j] = value[0].f[j];
4219      rgba[1][j] = value[1].f[j];
4220      rgba[2][j] = value[2].f[j];
4221      rgba[3][j] = value[3].f[j];
4222   }
4223   if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
4224      for (j = 0; j < TGSI_QUAD_SIZE; j++) {
4225         rgba2[0][j] = value2[0].f[j];
4226         rgba2[1][j] = value2[1].f[j];
4227         rgba2[2][j] = value2[2].f[j];
4228         rgba2[3][j] = value2[3].f[j];
4229      }
4230   }
4231
4232   mach->Image->op(mach->Image, &params, inst->Instruction.Opcode,
4233                   r[0].i, r[1].i, r[2].i, sample_r.i,
4234                   rgba, rgba2);
4235
4236   for (j = 0; j < TGSI_QUAD_SIZE; j++) {
4237      r[0].f[j] = rgba[0][j];
4238      r[1].f[j] = rgba[1][j];
4239      r[2].f[j] = rgba[2][j];
4240      r[3].f[j] = rgba[3][j];
4241   }
4242   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
4243      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
4244         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
4245      }
4246   }
4247}
4248
4249static void
4250exec_atomop_buf(struct tgsi_exec_machine *mach,
4251                const struct tgsi_full_instruction *inst)
4252{
4253   union tgsi_exec_channel r[4];
4254   union tgsi_exec_channel value[4], value2[4];
4255   float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
4256   float rgba2[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
4257   struct tgsi_buffer_params params;
4258   int i, j;
4259   uint unit, chan;
4260   int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
4261
4262   unit = fetch_sampler_unit(mach, inst, 0);
4263
4264   params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
4265   params.unit = unit;
4266   params.writemask = inst->Dst[0].Register.WriteMask;
4267
4268   IFETCH(&r[0], 1, TGSI_CHAN_X);
4269
4270   for (i = 0; i < 4; i++) {
4271      FETCH(&value[i], 2, TGSI_CHAN_X + i);
4272      if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
4273         FETCH(&value2[i], 3, TGSI_CHAN_X + i);
4274   }
4275
4276   for (j = 0; j < TGSI_QUAD_SIZE; j++) {
4277      rgba[0][j] = value[0].f[j];
4278      rgba[1][j] = value[1].f[j];
4279      rgba[2][j] = value[2].f[j];
4280      rgba[3][j] = value[3].f[j];
4281   }
4282   if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
4283      for (j = 0; j < TGSI_QUAD_SIZE; j++) {
4284         rgba2[0][j] = value2[0].f[j];
4285         rgba2[1][j] = value2[1].f[j];
4286         rgba2[2][j] = value2[2].f[j];
4287         rgba2[3][j] = value2[3].f[j];
4288      }
4289   }
4290
4291   mach->Buffer->op(mach->Buffer, &params, inst->Instruction.Opcode,
4292                   r[0].i,
4293                   rgba, rgba2);
4294
4295   for (j = 0; j < TGSI_QUAD_SIZE; j++) {
4296      r[0].f[j] = rgba[0][j];
4297      r[1].f[j] = rgba[1][j];
4298      r[2].f[j] = rgba[2][j];
4299      r[3].f[j] = rgba[3][j];
4300   }
4301   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
4302      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
4303         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
4304      }
4305   }
4306}
4307
4308static void
4309exec_atomop_mem(struct tgsi_exec_machine *mach,
4310                const struct tgsi_full_instruction *inst)
4311{
4312   union tgsi_exec_channel r[4];
4313   union tgsi_exec_channel value[4], value2[4];
4314   char *ptr = mach->LocalMem;
4315   uint32_t val;
4316   uint chan, i;
4317   uint32_t offset;
4318   int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
4319   int execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
4320   IFETCH(&r[0], 1, TGSI_CHAN_X);
4321
4322   if (r[0].u[0] >= mach->LocalMemSize)
4323      return;
4324
4325   offset = r[0].u[0];
4326   ptr += offset;
4327   for (i = 0; i < 4; i++) {
4328      FETCH(&value[i], 2, TGSI_CHAN_X + i);
4329      if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
4330         FETCH(&value2[i], 3, TGSI_CHAN_X + i);
4331   }
4332
4333   memcpy(&r[0].u[0], ptr, 4);
4334   val = r[0].u[0];
4335   switch (inst->Instruction.Opcode) {
4336   case TGSI_OPCODE_ATOMUADD:
4337      val += value[0].u[0];
4338      break;
4339   case TGSI_OPCODE_ATOMXOR:
4340      val ^= value[0].u[0];
4341      break;
4342   case TGSI_OPCODE_ATOMOR:
4343      val |= value[0].u[0];
4344      break;
4345   case TGSI_OPCODE_ATOMAND:
4346      val &= value[0].u[0];
4347      break;
4348   case TGSI_OPCODE_ATOMUMIN:
4349      val = MIN2(val, value[0].u[0]);
4350      break;
4351   case TGSI_OPCODE_ATOMUMAX:
4352      val = MAX2(val, value[0].u[0]);
4353      break;
4354   case TGSI_OPCODE_ATOMIMIN:
4355      val = MIN2(r[0].i[0], value[0].i[0]);
4356      break;
4357   case TGSI_OPCODE_ATOMIMAX:
4358      val = MAX2(r[0].i[0], value[0].i[0]);
4359      break;
4360   case TGSI_OPCODE_ATOMXCHG:
4361      val = value[0].i[0];
4362      break;
4363   case TGSI_OPCODE_ATOMCAS:
4364      if (val == value[0].u[0])
4365         val = value2[0].u[0];
4366      break;
4367   default:
4368      break;
4369   }
4370   for (i = 0; i < TGSI_QUAD_SIZE; i++)
4371      if (execmask & (1 << i))
4372         memcpy(ptr, &val, 4);
4373
4374   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
4375      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
4376         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
4377      }
4378   }
4379}
4380
4381static void
4382exec_atomop(struct tgsi_exec_machine *mach,
4383            const struct tgsi_full_instruction *inst)
4384{
4385   if (inst->Src[0].Register.File == TGSI_FILE_IMAGE)
4386      exec_atomop_img(mach, inst);
4387   else if (inst->Src[0].Register.File == TGSI_FILE_BUFFER)
4388      exec_atomop_buf(mach, inst);
4389   else if (inst->Src[0].Register.File == TGSI_FILE_MEMORY)
4390      exec_atomop_mem(mach, inst);
4391}
4392
4393static void
4394exec_resq_img(struct tgsi_exec_machine *mach,
4395              const struct tgsi_full_instruction *inst)
4396{
4397   int result[4];
4398   union tgsi_exec_channel r[4];
4399   uint unit;
4400   int i, chan, j;
4401   struct tgsi_image_params params;
4402   int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
4403
4404   unit = fetch_sampler_unit(mach, inst, 0);
4405
4406   params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
4407   params.unit = unit;
4408   params.tgsi_tex_instr = inst->Memory.Texture;
4409   params.format = inst->Memory.Format;
4410
4411   mach->Image->get_dims(mach->Image, &params, result);
4412
4413   for (i = 0; i < TGSI_QUAD_SIZE; i++) {
4414      for (j = 0; j < 4; j++) {
4415         r[j].i[i] = result[j];
4416      }
4417   }
4418
4419   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
4420      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
4421         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan,
4422                    TGSI_EXEC_DATA_INT);
4423      }
4424   }
4425}
4426
4427static void
4428exec_resq_buf(struct tgsi_exec_machine *mach,
4429              const struct tgsi_full_instruction *inst)
4430{
4431   int result;
4432   union tgsi_exec_channel r[4];
4433   uint unit;
4434   int i, chan;
4435   struct tgsi_buffer_params params;
4436   int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
4437
4438   unit = fetch_sampler_unit(mach, inst, 0);
4439
4440   params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
4441   params.unit = unit;
4442
4443   mach->Buffer->get_dims(mach->Buffer, &params, &result);
4444
4445   for (i = 0; i < TGSI_QUAD_SIZE; i++) {
4446      r[0].i[i] = result;
4447   }
4448
4449   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
4450      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
4451         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan,
4452                    TGSI_EXEC_DATA_INT);
4453      }
4454   }
4455}
4456
4457static void
4458exec_resq(struct tgsi_exec_machine *mach,
4459          const struct tgsi_full_instruction *inst)
4460{
4461   if (inst->Src[0].Register.File == TGSI_FILE_IMAGE)
4462      exec_resq_img(mach, inst);
4463   else
4464      exec_resq_buf(mach, inst);
4465}
4466
4467static void
4468micro_f2u64(union tgsi_double_channel *dst,
4469            const union tgsi_exec_channel *src)
4470{
4471   dst->u64[0] = (uint64_t)src->f[0];
4472   dst->u64[1] = (uint64_t)src->f[1];
4473   dst->u64[2] = (uint64_t)src->f[2];
4474   dst->u64[3] = (uint64_t)src->f[3];
4475}
4476
4477static void
4478micro_f2i64(union tgsi_double_channel *dst,
4479            const union tgsi_exec_channel *src)
4480{
4481   dst->i64[0] = (int64_t)src->f[0];
4482   dst->i64[1] = (int64_t)src->f[1];
4483   dst->i64[2] = (int64_t)src->f[2];
4484   dst->i64[3] = (int64_t)src->f[3];
4485}
4486
4487static void
4488micro_u2i64(union tgsi_double_channel *dst,
4489            const union tgsi_exec_channel *src)
4490{
4491   dst->u64[0] = (uint64_t)src->u[0];
4492   dst->u64[1] = (uint64_t)src->u[1];
4493   dst->u64[2] = (uint64_t)src->u[2];
4494   dst->u64[3] = (uint64_t)src->u[3];
4495}
4496
4497static void
4498micro_i2i64(union tgsi_double_channel *dst,
4499            const union tgsi_exec_channel *src)
4500{
4501   dst->i64[0] = (int64_t)src->i[0];
4502   dst->i64[1] = (int64_t)src->i[1];
4503   dst->i64[2] = (int64_t)src->i[2];
4504   dst->i64[3] = (int64_t)src->i[3];
4505}
4506
4507static void
4508micro_d2u64(union tgsi_double_channel *dst,
4509           const union tgsi_double_channel *src)
4510{
4511   dst->u64[0] = (uint64_t)src->d[0];
4512   dst->u64[1] = (uint64_t)src->d[1];
4513   dst->u64[2] = (uint64_t)src->d[2];
4514   dst->u64[3] = (uint64_t)src->d[3];
4515}
4516
4517static void
4518micro_d2i64(union tgsi_double_channel *dst,
4519           const union tgsi_double_channel *src)
4520{
4521   dst->i64[0] = (int64_t)src->d[0];
4522   dst->i64[1] = (int64_t)src->d[1];
4523   dst->i64[2] = (int64_t)src->d[2];
4524   dst->i64[3] = (int64_t)src->d[3];
4525}
4526
4527static void
4528micro_u642d(union tgsi_double_channel *dst,
4529           const union tgsi_double_channel *src)
4530{
4531   dst->d[0] = (double)src->u64[0];
4532   dst->d[1] = (double)src->u64[1];
4533   dst->d[2] = (double)src->u64[2];
4534   dst->d[3] = (double)src->u64[3];
4535}
4536
4537static void
4538micro_i642d(union tgsi_double_channel *dst,
4539           const union tgsi_double_channel *src)
4540{
4541   dst->d[0] = (double)src->i64[0];
4542   dst->d[1] = (double)src->i64[1];
4543   dst->d[2] = (double)src->i64[2];
4544   dst->d[3] = (double)src->i64[3];
4545}
4546
4547static void
4548micro_u642f(union tgsi_exec_channel *dst,
4549            const union tgsi_double_channel *src)
4550{
4551   dst->f[0] = (float)src->u64[0];
4552   dst->f[1] = (float)src->u64[1];
4553   dst->f[2] = (float)src->u64[2];
4554   dst->f[3] = (float)src->u64[3];
4555}
4556
4557static void
4558micro_i642f(union tgsi_exec_channel *dst,
4559            const union tgsi_double_channel *src)
4560{
4561   dst->f[0] = (float)src->i64[0];
4562   dst->f[1] = (float)src->i64[1];
4563   dst->f[2] = (float)src->i64[2];
4564   dst->f[3] = (float)src->i64[3];
4565}
4566
4567static void
4568exec_t_2_64(struct tgsi_exec_machine *mach,
4569          const struct tgsi_full_instruction *inst,
4570          micro_dop_s op,
4571          enum tgsi_exec_datatype src_datatype)
4572{
4573   union tgsi_exec_channel src;
4574   union tgsi_double_channel dst;
4575
4576   if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY) == TGSI_WRITEMASK_XY) {
4577      fetch_source(mach, &src, &inst->Src[0], TGSI_CHAN_X, src_datatype);
4578      op(&dst, &src);
4579      store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_CHAN_Y);
4580   }
4581   if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_ZW) == TGSI_WRITEMASK_ZW) {
4582      fetch_source(mach, &src, &inst->Src[0], TGSI_CHAN_Y, src_datatype);
4583      op(&dst, &src);
4584      store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_CHAN_W);
4585   }
4586}
4587
4588static void
4589exec_64_2_t(struct tgsi_exec_machine *mach,
4590            const struct tgsi_full_instruction *inst,
4591            micro_sop_d op,
4592            enum tgsi_exec_datatype dst_datatype)
4593{
4594   union tgsi_double_channel src;
4595   union tgsi_exec_channel dst;
4596   int wm = inst->Dst[0].Register.WriteMask;
4597   int i;
4598   int bit;
4599   for (i = 0; i < 2; i++) {
4600      bit = ffs(wm);
4601      if (bit) {
4602         wm &= ~(1 << (bit - 1));
4603         if (i == 0)
4604            fetch_double_channel(mach, &src, &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
4605         else
4606            fetch_double_channel(mach, &src, &inst->Src[0], TGSI_CHAN_Z, TGSI_CHAN_W);
4607         op(&dst, &src);
4608         store_dest(mach, &dst, &inst->Dst[0], inst, bit - 1, dst_datatype);
4609      }
4610   }
4611}
4612
4613static void
4614micro_i2f(union tgsi_exec_channel *dst,
4615          const union tgsi_exec_channel *src)
4616{
4617   dst->f[0] = (float)src->i[0];
4618   dst->f[1] = (float)src->i[1];
4619   dst->f[2] = (float)src->i[2];
4620   dst->f[3] = (float)src->i[3];
4621}
4622
4623static void
4624micro_not(union tgsi_exec_channel *dst,
4625          const union tgsi_exec_channel *src)
4626{
4627   dst->u[0] = ~src->u[0];
4628   dst->u[1] = ~src->u[1];
4629   dst->u[2] = ~src->u[2];
4630   dst->u[3] = ~src->u[3];
4631}
4632
4633static void
4634micro_shl(union tgsi_exec_channel *dst,
4635          const union tgsi_exec_channel *src0,
4636          const union tgsi_exec_channel *src1)
4637{
4638   unsigned masked_count;
4639   masked_count = src1->u[0] & 0x1f;
4640   dst->u[0] = src0->u[0] << masked_count;
4641   masked_count = src1->u[1] & 0x1f;
4642   dst->u[1] = src0->u[1] << masked_count;
4643   masked_count = src1->u[2] & 0x1f;
4644   dst->u[2] = src0->u[2] << masked_count;
4645   masked_count = src1->u[3] & 0x1f;
4646   dst->u[3] = src0->u[3] << masked_count;
4647}
4648
4649static void
4650micro_and(union tgsi_exec_channel *dst,
4651          const union tgsi_exec_channel *src0,
4652          const union tgsi_exec_channel *src1)
4653{
4654   dst->u[0] = src0->u[0] & src1->u[0];
4655   dst->u[1] = src0->u[1] & src1->u[1];
4656   dst->u[2] = src0->u[2] & src1->u[2];
4657   dst->u[3] = src0->u[3] & src1->u[3];
4658}
4659
4660static void
4661micro_or(union tgsi_exec_channel *dst,
4662         const union tgsi_exec_channel *src0,
4663         const union tgsi_exec_channel *src1)
4664{
4665   dst->u[0] = src0->u[0] | src1->u[0];
4666   dst->u[1] = src0->u[1] | src1->u[1];
4667   dst->u[2] = src0->u[2] | src1->u[2];
4668   dst->u[3] = src0->u[3] | src1->u[3];
4669}
4670
4671static void
4672micro_xor(union tgsi_exec_channel *dst,
4673          const union tgsi_exec_channel *src0,
4674          const union tgsi_exec_channel *src1)
4675{
4676   dst->u[0] = src0->u[0] ^ src1->u[0];
4677   dst->u[1] = src0->u[1] ^ src1->u[1];
4678   dst->u[2] = src0->u[2] ^ src1->u[2];
4679   dst->u[3] = src0->u[3] ^ src1->u[3];
4680}
4681
4682static void
4683micro_mod(union tgsi_exec_channel *dst,
4684          const union tgsi_exec_channel *src0,
4685          const union tgsi_exec_channel *src1)
4686{
4687   dst->i[0] = src0->i[0] % src1->i[0];
4688   dst->i[1] = src0->i[1] % src1->i[1];
4689   dst->i[2] = src0->i[2] % src1->i[2];
4690   dst->i[3] = src0->i[3] % src1->i[3];
4691}
4692
4693static void
4694micro_f2i(union tgsi_exec_channel *dst,
4695          const union tgsi_exec_channel *src)
4696{
4697   dst->i[0] = (int)src->f[0];
4698   dst->i[1] = (int)src->f[1];
4699   dst->i[2] = (int)src->f[2];
4700   dst->i[3] = (int)src->f[3];
4701}
4702
4703static void
4704micro_fseq(union tgsi_exec_channel *dst,
4705           const union tgsi_exec_channel *src0,
4706           const union tgsi_exec_channel *src1)
4707{
4708   dst->u[0] = src0->f[0] == src1->f[0] ? ~0 : 0;
4709   dst->u[1] = src0->f[1] == src1->f[1] ? ~0 : 0;
4710   dst->u[2] = src0->f[2] == src1->f[2] ? ~0 : 0;
4711   dst->u[3] = src0->f[3] == src1->f[3] ? ~0 : 0;
4712}
4713
4714static void
4715micro_fsge(union tgsi_exec_channel *dst,
4716           const union tgsi_exec_channel *src0,
4717           const union tgsi_exec_channel *src1)
4718{
4719   dst->u[0] = src0->f[0] >= src1->f[0] ? ~0 : 0;
4720   dst->u[1] = src0->f[1] >= src1->f[1] ? ~0 : 0;
4721   dst->u[2] = src0->f[2] >= src1->f[2] ? ~0 : 0;
4722   dst->u[3] = src0->f[3] >= src1->f[3] ? ~0 : 0;
4723}
4724
4725static void
4726micro_fslt(union tgsi_exec_channel *dst,
4727           const union tgsi_exec_channel *src0,
4728           const union tgsi_exec_channel *src1)
4729{
4730   dst->u[0] = src0->f[0] < src1->f[0] ? ~0 : 0;
4731   dst->u[1] = src0->f[1] < src1->f[1] ? ~0 : 0;
4732   dst->u[2] = src0->f[2] < src1->f[2] ? ~0 : 0;
4733   dst->u[3] = src0->f[3] < src1->f[3] ? ~0 : 0;
4734}
4735
4736static void
4737micro_fsne(union tgsi_exec_channel *dst,
4738           const union tgsi_exec_channel *src0,
4739           const union tgsi_exec_channel *src1)
4740{
4741   dst->u[0] = src0->f[0] != src1->f[0] ? ~0 : 0;
4742   dst->u[1] = src0->f[1] != src1->f[1] ? ~0 : 0;
4743   dst->u[2] = src0->f[2] != src1->f[2] ? ~0 : 0;
4744   dst->u[3] = src0->f[3] != src1->f[3] ? ~0 : 0;
4745}
4746
4747static void
4748micro_idiv(union tgsi_exec_channel *dst,
4749           const union tgsi_exec_channel *src0,
4750           const union tgsi_exec_channel *src1)
4751{
4752   dst->i[0] = src1->i[0] ? src0->i[0] / src1->i[0] : 0;
4753   dst->i[1] = src1->i[1] ? src0->i[1] / src1->i[1] : 0;
4754   dst->i[2] = src1->i[2] ? src0->i[2] / src1->i[2] : 0;
4755   dst->i[3] = src1->i[3] ? src0->i[3] / src1->i[3] : 0;
4756}
4757
4758static void
4759micro_imax(union tgsi_exec_channel *dst,
4760           const union tgsi_exec_channel *src0,
4761           const union tgsi_exec_channel *src1)
4762{
4763   dst->i[0] = src0->i[0] > src1->i[0] ? src0->i[0] : src1->i[0];
4764   dst->i[1] = src0->i[1] > src1->i[1] ? src0->i[1] : src1->i[1];
4765   dst->i[2] = src0->i[2] > src1->i[2] ? src0->i[2] : src1->i[2];
4766   dst->i[3] = src0->i[3] > src1->i[3] ? src0->i[3] : src1->i[3];
4767}
4768
4769static void
4770micro_imin(union tgsi_exec_channel *dst,
4771           const union tgsi_exec_channel *src0,
4772           const union tgsi_exec_channel *src1)
4773{
4774   dst->i[0] = src0->i[0] < src1->i[0] ? src0->i[0] : src1->i[0];
4775   dst->i[1] = src0->i[1] < src1->i[1] ? src0->i[1] : src1->i[1];
4776   dst->i[2] = src0->i[2] < src1->i[2] ? src0->i[2] : src1->i[2];
4777   dst->i[3] = src0->i[3] < src1->i[3] ? src0->i[3] : src1->i[3];
4778}
4779
4780static void
4781micro_isge(union tgsi_exec_channel *dst,
4782           const union tgsi_exec_channel *src0,
4783           const union tgsi_exec_channel *src1)
4784{
4785   dst->i[0] = src0->i[0] >= src1->i[0] ? -1 : 0;
4786   dst->i[1] = src0->i[1] >= src1->i[1] ? -1 : 0;
4787   dst->i[2] = src0->i[2] >= src1->i[2] ? -1 : 0;
4788   dst->i[3] = src0->i[3] >= src1->i[3] ? -1 : 0;
4789}
4790
4791static void
4792micro_ishr(union tgsi_exec_channel *dst,
4793           const union tgsi_exec_channel *src0,
4794           const union tgsi_exec_channel *src1)
4795{
4796   unsigned masked_count;
4797   masked_count = src1->i[0] & 0x1f;
4798   dst->i[0] = src0->i[0] >> masked_count;
4799   masked_count = src1->i[1] & 0x1f;
4800   dst->i[1] = src0->i[1] >> masked_count;
4801   masked_count = src1->i[2] & 0x1f;
4802   dst->i[2] = src0->i[2] >> masked_count;
4803   masked_count = src1->i[3] & 0x1f;
4804   dst->i[3] = src0->i[3] >> masked_count;
4805}
4806
4807static void
4808micro_islt(union tgsi_exec_channel *dst,
4809           const union tgsi_exec_channel *src0,
4810           const union tgsi_exec_channel *src1)
4811{
4812   dst->i[0] = src0->i[0] < src1->i[0] ? -1 : 0;
4813   dst->i[1] = src0->i[1] < src1->i[1] ? -1 : 0;
4814   dst->i[2] = src0->i[2] < src1->i[2] ? -1 : 0;
4815   dst->i[3] = src0->i[3] < src1->i[3] ? -1 : 0;
4816}
4817
4818static void
4819micro_f2u(union tgsi_exec_channel *dst,
4820          const union tgsi_exec_channel *src)
4821{
4822   dst->u[0] = (uint)src->f[0];
4823   dst->u[1] = (uint)src->f[1];
4824   dst->u[2] = (uint)src->f[2];
4825   dst->u[3] = (uint)src->f[3];
4826}
4827
4828static void
4829micro_u2f(union tgsi_exec_channel *dst,
4830          const union tgsi_exec_channel *src)
4831{
4832   dst->f[0] = (float)src->u[0];
4833   dst->f[1] = (float)src->u[1];
4834   dst->f[2] = (float)src->u[2];
4835   dst->f[3] = (float)src->u[3];
4836}
4837
4838static void
4839micro_uadd(union tgsi_exec_channel *dst,
4840           const union tgsi_exec_channel *src0,
4841           const union tgsi_exec_channel *src1)
4842{
4843   dst->u[0] = src0->u[0] + src1->u[0];
4844   dst->u[1] = src0->u[1] + src1->u[1];
4845   dst->u[2] = src0->u[2] + src1->u[2];
4846   dst->u[3] = src0->u[3] + src1->u[3];
4847}
4848
4849static void
4850micro_udiv(union tgsi_exec_channel *dst,
4851           const union tgsi_exec_channel *src0,
4852           const union tgsi_exec_channel *src1)
4853{
4854   dst->u[0] = src1->u[0] ? src0->u[0] / src1->u[0] : ~0u;
4855   dst->u[1] = src1->u[1] ? src0->u[1] / src1->u[1] : ~0u;
4856   dst->u[2] = src1->u[2] ? src0->u[2] / src1->u[2] : ~0u;
4857   dst->u[3] = src1->u[3] ? src0->u[3] / src1->u[3] : ~0u;
4858}
4859
4860static void
4861micro_umad(union tgsi_exec_channel *dst,
4862           const union tgsi_exec_channel *src0,
4863           const union tgsi_exec_channel *src1,
4864           const union tgsi_exec_channel *src2)
4865{
4866   dst->u[0] = src0->u[0] * src1->u[0] + src2->u[0];
4867   dst->u[1] = src0->u[1] * src1->u[1] + src2->u[1];
4868   dst->u[2] = src0->u[2] * src1->u[2] + src2->u[2];
4869   dst->u[3] = src0->u[3] * src1->u[3] + src2->u[3];
4870}
4871
4872static void
4873micro_umax(union tgsi_exec_channel *dst,
4874           const union tgsi_exec_channel *src0,
4875           const union tgsi_exec_channel *src1)
4876{
4877   dst->u[0] = src0->u[0] > src1->u[0] ? src0->u[0] : src1->u[0];
4878   dst->u[1] = src0->u[1] > src1->u[1] ? src0->u[1] : src1->u[1];
4879   dst->u[2] = src0->u[2] > src1->u[2] ? src0->u[2] : src1->u[2];
4880   dst->u[3] = src0->u[3] > src1->u[3] ? src0->u[3] : src1->u[3];
4881}
4882
4883static void
4884micro_umin(union tgsi_exec_channel *dst,
4885           const union tgsi_exec_channel *src0,
4886           const union tgsi_exec_channel *src1)
4887{
4888   dst->u[0] = src0->u[0] < src1->u[0] ? src0->u[0] : src1->u[0];
4889   dst->u[1] = src0->u[1] < src1->u[1] ? src0->u[1] : src1->u[1];
4890   dst->u[2] = src0->u[2] < src1->u[2] ? src0->u[2] : src1->u[2];
4891   dst->u[3] = src0->u[3] < src1->u[3] ? src0->u[3] : src1->u[3];
4892}
4893
4894static void
4895micro_umod(union tgsi_exec_channel *dst,
4896           const union tgsi_exec_channel *src0,
4897           const union tgsi_exec_channel *src1)
4898{
4899   dst->u[0] = src1->u[0] ? src0->u[0] % src1->u[0] : ~0u;
4900   dst->u[1] = src1->u[1] ? src0->u[1] % src1->u[1] : ~0u;
4901   dst->u[2] = src1->u[2] ? src0->u[2] % src1->u[2] : ~0u;
4902   dst->u[3] = src1->u[3] ? src0->u[3] % src1->u[3] : ~0u;
4903}
4904
4905static void
4906micro_umul(union tgsi_exec_channel *dst,
4907           const union tgsi_exec_channel *src0,
4908           const union tgsi_exec_channel *src1)
4909{
4910   dst->u[0] = src0->u[0] * src1->u[0];
4911   dst->u[1] = src0->u[1] * src1->u[1];
4912   dst->u[2] = src0->u[2] * src1->u[2];
4913   dst->u[3] = src0->u[3] * src1->u[3];
4914}
4915
4916static void
4917micro_imul_hi(union tgsi_exec_channel *dst,
4918              const union tgsi_exec_channel *src0,
4919              const union tgsi_exec_channel *src1)
4920{
4921#define I64M(x, y) ((((int64_t)x) * ((int64_t)y)) >> 32)
4922   dst->i[0] = I64M(src0->i[0], src1->i[0]);
4923   dst->i[1] = I64M(src0->i[1], src1->i[1]);
4924   dst->i[2] = I64M(src0->i[2], src1->i[2]);
4925   dst->i[3] = I64M(src0->i[3], src1->i[3]);
4926#undef I64M
4927}
4928
4929static void
4930micro_umul_hi(union tgsi_exec_channel *dst,
4931              const union tgsi_exec_channel *src0,
4932              const union tgsi_exec_channel *src1)
4933{
4934#define U64M(x, y) ((((uint64_t)x) * ((uint64_t)y)) >> 32)
4935   dst->u[0] = U64M(src0->u[0], src1->u[0]);
4936   dst->u[1] = U64M(src0->u[1], src1->u[1]);
4937   dst->u[2] = U64M(src0->u[2], src1->u[2]);
4938   dst->u[3] = U64M(src0->u[3], src1->u[3]);
4939#undef U64M
4940}
4941
4942static void
4943micro_useq(union tgsi_exec_channel *dst,
4944           const union tgsi_exec_channel *src0,
4945           const union tgsi_exec_channel *src1)
4946{
4947   dst->u[0] = src0->u[0] == src1->u[0] ? ~0 : 0;
4948   dst->u[1] = src0->u[1] == src1->u[1] ? ~0 : 0;
4949   dst->u[2] = src0->u[2] == src1->u[2] ? ~0 : 0;
4950   dst->u[3] = src0->u[3] == src1->u[3] ? ~0 : 0;
4951}
4952
4953static void
4954micro_usge(union tgsi_exec_channel *dst,
4955           const union tgsi_exec_channel *src0,
4956           const union tgsi_exec_channel *src1)
4957{
4958   dst->u[0] = src0->u[0] >= src1->u[0] ? ~0 : 0;
4959   dst->u[1] = src0->u[1] >= src1->u[1] ? ~0 : 0;
4960   dst->u[2] = src0->u[2] >= src1->u[2] ? ~0 : 0;
4961   dst->u[3] = src0->u[3] >= src1->u[3] ? ~0 : 0;
4962}
4963
4964static void
4965micro_ushr(union tgsi_exec_channel *dst,
4966           const union tgsi_exec_channel *src0,
4967           const union tgsi_exec_channel *src1)
4968{
4969   unsigned masked_count;
4970   masked_count = src1->u[0] & 0x1f;
4971   dst->u[0] = src0->u[0] >> masked_count;
4972   masked_count = src1->u[1] & 0x1f;
4973   dst->u[1] = src0->u[1] >> masked_count;
4974   masked_count = src1->u[2] & 0x1f;
4975   dst->u[2] = src0->u[2] >> masked_count;
4976   masked_count = src1->u[3] & 0x1f;
4977   dst->u[3] = src0->u[3] >> masked_count;
4978}
4979
4980static void
4981micro_uslt(union tgsi_exec_channel *dst,
4982           const union tgsi_exec_channel *src0,
4983           const union tgsi_exec_channel *src1)
4984{
4985   dst->u[0] = src0->u[0] < src1->u[0] ? ~0 : 0;
4986   dst->u[1] = src0->u[1] < src1->u[1] ? ~0 : 0;
4987   dst->u[2] = src0->u[2] < src1->u[2] ? ~0 : 0;
4988   dst->u[3] = src0->u[3] < src1->u[3] ? ~0 : 0;
4989}
4990
4991static void
4992micro_usne(union tgsi_exec_channel *dst,
4993           const union tgsi_exec_channel *src0,
4994           const union tgsi_exec_channel *src1)
4995{
4996   dst->u[0] = src0->u[0] != src1->u[0] ? ~0 : 0;
4997   dst->u[1] = src0->u[1] != src1->u[1] ? ~0 : 0;
4998   dst->u[2] = src0->u[2] != src1->u[2] ? ~0 : 0;
4999   dst->u[3] = src0->u[3] != src1->u[3] ? ~0 : 0;
5000}
5001
5002static void
5003micro_uarl(union tgsi_exec_channel *dst,
5004           const union tgsi_exec_channel *src)
5005{
5006   dst->i[0] = src->u[0];
5007   dst->i[1] = src->u[1];
5008   dst->i[2] = src->u[2];
5009   dst->i[3] = src->u[3];
5010}
5011
5012static void
5013micro_ucmp(union tgsi_exec_channel *dst,
5014           const union tgsi_exec_channel *src0,
5015           const union tgsi_exec_channel *src1,
5016           const union tgsi_exec_channel *src2)
5017{
5018   dst->u[0] = src0->u[0] ? src1->u[0] : src2->u[0];
5019   dst->u[1] = src0->u[1] ? src1->u[1] : src2->u[1];
5020   dst->u[2] = src0->u[2] ? src1->u[2] : src2->u[2];
5021   dst->u[3] = src0->u[3] ? src1->u[3] : src2->u[3];
5022}
5023
5024/**
5025 * Signed bitfield extract (i.e. sign-extend the extracted bits)
5026 */
5027static void
5028micro_ibfe(union tgsi_exec_channel *dst,
5029           const union tgsi_exec_channel *src0,
5030           const union tgsi_exec_channel *src1,
5031           const union tgsi_exec_channel *src2)
5032{
5033   int i;
5034   for (i = 0; i < 4; i++) {
5035      int width = src2->i[i] & 0x1f;
5036      int offset = src1->i[i] & 0x1f;
5037      if (width == 0)
5038         dst->i[i] = 0;
5039      else if (width + offset < 32)
5040         dst->i[i] = (src0->i[i] << (32 - width - offset)) >> (32 - width);
5041      else
5042         dst->i[i] = src0->i[i] >> offset;
5043   }
5044}
5045
5046/**
5047 * Unsigned bitfield extract
5048 */
5049static void
5050micro_ubfe(union tgsi_exec_channel *dst,
5051           const union tgsi_exec_channel *src0,
5052           const union tgsi_exec_channel *src1,
5053           const union tgsi_exec_channel *src2)
5054{
5055   int i;
5056   for (i = 0; i < 4; i++) {
5057      int width = src2->u[i] & 0x1f;
5058      int offset = src1->u[i] & 0x1f;
5059      if (width == 0)
5060         dst->u[i] = 0;
5061      else if (width + offset < 32)
5062         dst->u[i] = (src0->u[i] << (32 - width - offset)) >> (32 - width);
5063      else
5064         dst->u[i] = src0->u[i] >> offset;
5065   }
5066}
5067
5068/**
5069 * Bitfield insert: copy low bits from src1 into a region of src0.
5070 */
5071static void
5072micro_bfi(union tgsi_exec_channel *dst,
5073          const union tgsi_exec_channel *src0,
5074          const union tgsi_exec_channel *src1,
5075          const union tgsi_exec_channel *src2,
5076          const union tgsi_exec_channel *src3)
5077{
5078   int i;
5079   for (i = 0; i < 4; i++) {
5080      int width = src3->u[i] & 0x1f;
5081      int offset = src2->u[i] & 0x1f;
5082      int bitmask = ((1 << width) - 1) << offset;
5083      dst->u[i] = ((src1->u[i] << offset) & bitmask) | (src0->u[i] & ~bitmask);
5084   }
5085}
5086
5087static void
5088micro_brev(union tgsi_exec_channel *dst,
5089           const union tgsi_exec_channel *src)
5090{
5091   dst->u[0] = util_bitreverse(src->u[0]);
5092   dst->u[1] = util_bitreverse(src->u[1]);
5093   dst->u[2] = util_bitreverse(src->u[2]);
5094   dst->u[3] = util_bitreverse(src->u[3]);
5095}
5096
5097static void
5098micro_popc(union tgsi_exec_channel *dst,
5099           const union tgsi_exec_channel *src)
5100{
5101   dst->u[0] = util_bitcount(src->u[0]);
5102   dst->u[1] = util_bitcount(src->u[1]);
5103   dst->u[2] = util_bitcount(src->u[2]);
5104   dst->u[3] = util_bitcount(src->u[3]);
5105}
5106
5107static void
5108micro_lsb(union tgsi_exec_channel *dst,
5109          const union tgsi_exec_channel *src)
5110{
5111   dst->i[0] = ffs(src->u[0]) - 1;
5112   dst->i[1] = ffs(src->u[1]) - 1;
5113   dst->i[2] = ffs(src->u[2]) - 1;
5114   dst->i[3] = ffs(src->u[3]) - 1;
5115}
5116
5117static void
5118micro_imsb(union tgsi_exec_channel *dst,
5119           const union tgsi_exec_channel *src)
5120{
5121   dst->i[0] = util_last_bit_signed(src->i[0]) - 1;
5122   dst->i[1] = util_last_bit_signed(src->i[1]) - 1;
5123   dst->i[2] = util_last_bit_signed(src->i[2]) - 1;
5124   dst->i[3] = util_last_bit_signed(src->i[3]) - 1;
5125}
5126
5127static void
5128micro_umsb(union tgsi_exec_channel *dst,
5129           const union tgsi_exec_channel *src)
5130{
5131   dst->i[0] = util_last_bit(src->u[0]) - 1;
5132   dst->i[1] = util_last_bit(src->u[1]) - 1;
5133   dst->i[2] = util_last_bit(src->u[2]) - 1;
5134   dst->i[3] = util_last_bit(src->u[3]) - 1;
5135}
5136
5137/**
5138 * Execute a TGSI instruction.
5139 * Returns TRUE if a barrier instruction is hit,
5140 * otherwise FALSE.
5141 */
5142static boolean
5143exec_instruction(
5144   struct tgsi_exec_machine *mach,
5145   const struct tgsi_full_instruction *inst,
5146   int *pc )
5147{
5148   union tgsi_exec_channel r[10];
5149
5150   (*pc)++;
5151
5152   switch (inst->Instruction.Opcode) {
5153   case TGSI_OPCODE_ARL:
5154      exec_vector_unary(mach, inst, micro_arl, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
5155      break;
5156
5157   case TGSI_OPCODE_MOV:
5158      exec_vector_unary(mach, inst, micro_mov, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
5159      break;
5160
5161   case TGSI_OPCODE_LIT:
5162      exec_lit(mach, inst);
5163      break;
5164
5165   case TGSI_OPCODE_RCP:
5166      exec_scalar_unary(mach, inst, micro_rcp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5167      break;
5168
5169   case TGSI_OPCODE_RSQ:
5170      exec_scalar_unary(mach, inst, micro_rsq, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5171      break;
5172
5173   case TGSI_OPCODE_EXP:
5174      exec_exp(mach, inst);
5175      break;
5176
5177   case TGSI_OPCODE_LOG:
5178      exec_log(mach, inst);
5179      break;
5180
5181   case TGSI_OPCODE_MUL:
5182      exec_vector_binary(mach, inst, micro_mul, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5183      break;
5184
5185   case TGSI_OPCODE_ADD:
5186      exec_vector_binary(mach, inst, micro_add, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5187      break;
5188
5189   case TGSI_OPCODE_DP3:
5190      exec_dp3(mach, inst);
5191      break;
5192
5193   case TGSI_OPCODE_DP4:
5194      exec_dp4(mach, inst);
5195      break;
5196
5197   case TGSI_OPCODE_DST:
5198      exec_dst(mach, inst);
5199      break;
5200
5201   case TGSI_OPCODE_MIN:
5202      exec_vector_binary(mach, inst, micro_min, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5203      break;
5204
5205   case TGSI_OPCODE_MAX:
5206      exec_vector_binary(mach, inst, micro_max, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5207      break;
5208
5209   case TGSI_OPCODE_SLT:
5210      exec_vector_binary(mach, inst, micro_slt, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5211      break;
5212
5213   case TGSI_OPCODE_SGE:
5214      exec_vector_binary(mach, inst, micro_sge, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5215      break;
5216
5217   case TGSI_OPCODE_MAD:
5218      exec_vector_trinary(mach, inst, micro_mad, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5219      break;
5220
5221   case TGSI_OPCODE_LRP:
5222      exec_vector_trinary(mach, inst, micro_lrp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5223      break;
5224
5225   case TGSI_OPCODE_SQRT:
5226      exec_scalar_unary(mach, inst, micro_sqrt, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5227      break;
5228
5229   case TGSI_OPCODE_DP2A:
5230      exec_dp2a(mach, inst);
5231      break;
5232
5233   case TGSI_OPCODE_FRC:
5234      exec_vector_unary(mach, inst, micro_frc, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5235      break;
5236
5237   case TGSI_OPCODE_CLAMP:
5238      exec_vector_trinary(mach, inst, micro_clamp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5239      break;
5240
5241   case TGSI_OPCODE_FLR:
5242      exec_vector_unary(mach, inst, micro_flr, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5243      break;
5244
5245   case TGSI_OPCODE_ROUND:
5246      exec_vector_unary(mach, inst, micro_rnd, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5247      break;
5248
5249   case TGSI_OPCODE_EX2:
5250      exec_scalar_unary(mach, inst, micro_exp2, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5251      break;
5252
5253   case TGSI_OPCODE_LG2:
5254      exec_scalar_unary(mach, inst, micro_lg2, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5255      break;
5256
5257   case TGSI_OPCODE_POW:
5258      exec_scalar_binary(mach, inst, micro_pow, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5259      break;
5260
5261   case TGSI_OPCODE_XPD:
5262      exec_xpd(mach, inst);
5263      break;
5264
5265   case TGSI_OPCODE_DPH:
5266      exec_dph(mach, inst);
5267      break;
5268
5269   case TGSI_OPCODE_COS:
5270      exec_scalar_unary(mach, inst, micro_cos, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5271      break;
5272
5273   case TGSI_OPCODE_DDX:
5274      exec_vector_unary(mach, inst, micro_ddx, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5275      break;
5276
5277   case TGSI_OPCODE_DDY:
5278      exec_vector_unary(mach, inst, micro_ddy, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5279      break;
5280
5281   case TGSI_OPCODE_KILL:
5282      exec_kill (mach, inst);
5283      break;
5284
5285   case TGSI_OPCODE_KILL_IF:
5286      exec_kill_if (mach, inst);
5287      break;
5288
5289   case TGSI_OPCODE_PK2H:
5290      exec_pk2h(mach, inst);
5291      break;
5292
5293   case TGSI_OPCODE_PK2US:
5294      assert (0);
5295      break;
5296
5297   case TGSI_OPCODE_PK4B:
5298      assert (0);
5299      break;
5300
5301   case TGSI_OPCODE_PK4UB:
5302      assert (0);
5303      break;
5304
5305   case TGSI_OPCODE_SEQ:
5306      exec_vector_binary(mach, inst, micro_seq, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5307      break;
5308
5309   case TGSI_OPCODE_SGT:
5310      exec_vector_binary(mach, inst, micro_sgt, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5311      break;
5312
5313   case TGSI_OPCODE_SIN:
5314      exec_scalar_unary(mach, inst, micro_sin, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5315      break;
5316
5317   case TGSI_OPCODE_SLE:
5318      exec_vector_binary(mach, inst, micro_sle, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5319      break;
5320
5321   case TGSI_OPCODE_SNE:
5322      exec_vector_binary(mach, inst, micro_sne, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5323      break;
5324
5325   case TGSI_OPCODE_TEX:
5326      /* simple texture lookup */
5327      /* src[0] = texcoord */
5328      /* src[1] = sampler unit */
5329      exec_tex(mach, inst, TEX_MODIFIER_NONE, 1);
5330      break;
5331
5332   case TGSI_OPCODE_TXB:
5333      /* Texture lookup with lod bias */
5334      /* src[0] = texcoord (src[0].w = LOD bias) */
5335      /* src[1] = sampler unit */
5336      exec_tex(mach, inst, TEX_MODIFIER_LOD_BIAS, 1);
5337      break;
5338
5339   case TGSI_OPCODE_TXD:
      /* Texture lookup with explicit partial derivatives */
5341      /* src[0] = texcoord */
5342      /* src[1] = d[strq]/dx */
5343      /* src[2] = d[strq]/dy */
5344      /* src[3] = sampler unit */
5345      exec_txd(mach, inst);
5346      break;
5347
5348   case TGSI_OPCODE_TXL:
      /* Texture lookup with explicit LOD */
5350      /* src[0] = texcoord (src[0].w = LOD) */
5351      /* src[1] = sampler unit */
5352      exec_tex(mach, inst, TEX_MODIFIER_EXPLICIT_LOD, 1);
5353      break;
5354
5355   case TGSI_OPCODE_TXP:
5356      /* Texture lookup with projection */
5357      /* src[0] = texcoord (src[0].w = projection) */
5358      /* src[1] = sampler unit */
5359      exec_tex(mach, inst, TEX_MODIFIER_PROJECTED, 1);
5360      break;
5361
5362   case TGSI_OPCODE_TG4:
5363      /* src[0] = texcoord */
5364      /* src[1] = component */
5365      /* src[2] = sampler unit */
5366      exec_tex(mach, inst, TEX_MODIFIER_GATHER, 2);
5367      break;
5368
5369   case TGSI_OPCODE_LODQ:
5370      /* src[0] = texcoord */
5371      /* src[1] = sampler unit */
5372      exec_lodq(mach, inst);
5373      break;
5374
5375   case TGSI_OPCODE_UP2H:
5376      exec_up2h(mach, inst);
5377      break;
5378
5379   case TGSI_OPCODE_UP2US:
5380      assert (0);
5381      break;
5382
5383   case TGSI_OPCODE_UP4B:
5384      assert (0);
5385      break;
5386
5387   case TGSI_OPCODE_UP4UB:
5388      assert (0);
5389      break;
5390
5391   case TGSI_OPCODE_ARR:
5392      exec_vector_unary(mach, inst, micro_arr, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
5393      break;
5394
5395   case TGSI_OPCODE_CAL:
5396      /* skip the call if no execution channels are enabled */
5397      if (mach->ExecMask) {
5398         /* do the call */
5399
5400         /* First, record the depths of the execution stacks.
5401          * This is important for deeply nested/looped return statements.
5402          * We have to unwind the stacks by the correct amount.  For a
5403          * real code generator, we could determine the number of entries
5404          * to pop off each stack with simple static analysis and avoid
5405          * implementing this data structure at run time.
5406          */
5407         mach->CallStack[mach->CallStackTop].CondStackTop = mach->CondStackTop;
5408         mach->CallStack[mach->CallStackTop].LoopStackTop = mach->LoopStackTop;
5409         mach->CallStack[mach->CallStackTop].ContStackTop = mach->ContStackTop;
5410         mach->CallStack[mach->CallStackTop].SwitchStackTop = mach->SwitchStackTop;
5411         mach->CallStack[mach->CallStackTop].BreakStackTop = mach->BreakStackTop;
5412         /* note that PC was already incremented above */
5413         mach->CallStack[mach->CallStackTop].ReturnAddr = *pc;
5414
5415         mach->CallStackTop++;
5416
5417         /* Second, push the Cond, Loop, Cont, Func stacks */
5418         assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
5419         assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
5420         assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
5421         assert(mach->SwitchStackTop < TGSI_EXEC_MAX_SWITCH_NESTING);
5422         assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
5423         assert(mach->FuncStackTop < TGSI_EXEC_MAX_CALL_NESTING);
5424
5425         mach->CondStack[mach->CondStackTop++] = mach->CondMask;
5426         mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
5427         mach->ContStack[mach->ContStackTop++] = mach->ContMask;
5428         mach->SwitchStack[mach->SwitchStackTop++] = mach->Switch;
5429         mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
5430         mach->FuncStack[mach->FuncStackTop++] = mach->FuncMask;
5431
5432         /* Finally, jump to the subroutine.  The label is a pointer
5433          * (an instruction number) to the BGNSUB instruction.
5434          */
5435         *pc = inst->Label.Label;
5436         assert(mach->Instructions[*pc].Instruction.Opcode
5437                == TGSI_OPCODE_BGNSUB);
5438      }
5439      break;
5440
5441   case TGSI_OPCODE_RET:
5442      mach->FuncMask &= ~mach->ExecMask;
5443      UPDATE_EXEC_MASK(mach);
5444
5445      if (mach->FuncMask == 0x0) {
         /* really return now (otherwise, keep executing) */
5447
5448         if (mach->CallStackTop == 0) {
5449            /* returning from main() */
5450            mach->CondStackTop = 0;
5451            mach->LoopStackTop = 0;
5452            mach->ContStackTop = 0;
5453            mach->LoopLabelStackTop = 0;
5454            mach->SwitchStackTop = 0;
5455            mach->BreakStackTop = 0;
5456            *pc = -1;
5457            return FALSE;
5458         }
5459
5460         assert(mach->CallStackTop > 0);
5461         mach->CallStackTop--;
5462
5463         mach->CondStackTop = mach->CallStack[mach->CallStackTop].CondStackTop;
5464         mach->CondMask = mach->CondStack[mach->CondStackTop];
5465
5466         mach->LoopStackTop = mach->CallStack[mach->CallStackTop].LoopStackTop;
5467         mach->LoopMask = mach->LoopStack[mach->LoopStackTop];
5468
5469         mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
5470         mach->ContMask = mach->ContStack[mach->ContStackTop];
5471
5472         mach->SwitchStackTop = mach->CallStack[mach->CallStackTop].SwitchStackTop;
5473         mach->Switch = mach->SwitchStack[mach->SwitchStackTop];
5474
5475         mach->BreakStackTop = mach->CallStack[mach->CallStackTop].BreakStackTop;
5476         mach->BreakType = mach->BreakStack[mach->BreakStackTop];
5477
5478         assert(mach->FuncStackTop > 0);
5479         mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
5480
5481         *pc = mach->CallStack[mach->CallStackTop].ReturnAddr;
5482
5483         UPDATE_EXEC_MASK(mach);
5484      }
5485      break;
5486
5487   case TGSI_OPCODE_SSG:
5488      exec_vector_unary(mach, inst, micro_sgn, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5489      break;
5490
5491   case TGSI_OPCODE_CMP:
5492      exec_vector_trinary(mach, inst, micro_cmp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5493      break;
5494
5495   case TGSI_OPCODE_SCS:
5496      exec_scs(mach, inst);
5497      break;
5498
5499   case TGSI_OPCODE_DIV:
5500      exec_vector_binary(mach, inst, micro_div, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5501      break;
5502
5503   case TGSI_OPCODE_DP2:
5504      exec_dp2(mach, inst);
5505      break;
5506
5507   case TGSI_OPCODE_IF:
5508      /* push CondMask */
5509      assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
5510      mach->CondStack[mach->CondStackTop++] = mach->CondMask;
5511      FETCH( &r[0], 0, TGSI_CHAN_X );
5512      /* update CondMask */
5513      if( ! r[0].f[0] ) {
5514         mach->CondMask &= ~0x1;
5515      }
5516      if( ! r[0].f[1] ) {
5517         mach->CondMask &= ~0x2;
5518      }
5519      if( ! r[0].f[2] ) {
5520         mach->CondMask &= ~0x4;
5521      }
5522      if( ! r[0].f[3] ) {
5523         mach->CondMask &= ~0x8;
5524      }
5525      UPDATE_EXEC_MASK(mach);
5526      /* Todo: If CondMask==0, jump to ELSE */
5527      break;
5528
5529   case TGSI_OPCODE_UIF:
5530      /* push CondMask */
5531      assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
5532      mach->CondStack[mach->CondStackTop++] = mach->CondMask;
5533      IFETCH( &r[0], 0, TGSI_CHAN_X );
5534      /* update CondMask */
5535      if( ! r[0].u[0] ) {
5536         mach->CondMask &= ~0x1;
5537      }
5538      if( ! r[0].u[1] ) {
5539         mach->CondMask &= ~0x2;
5540      }
5541      if( ! r[0].u[2] ) {
5542         mach->CondMask &= ~0x4;
5543      }
5544      if( ! r[0].u[3] ) {
5545         mach->CondMask &= ~0x8;
5546      }
5547      UPDATE_EXEC_MASK(mach);
5548      /* Todo: If CondMask==0, jump to ELSE */
5549      break;
5550
5551   case TGSI_OPCODE_ELSE:
5552      /* invert CondMask wrt previous mask */
5553      {
5554         uint prevMask;
5555         assert(mach->CondStackTop > 0);
5556         prevMask = mach->CondStack[mach->CondStackTop - 1];
5557         mach->CondMask = ~mach->CondMask & prevMask;
5558         UPDATE_EXEC_MASK(mach);
5559         /* Todo: If CondMask==0, jump to ENDIF */
5560      }
5561      break;
5562
5563   case TGSI_OPCODE_ENDIF:
5564      /* pop CondMask */
5565      assert(mach->CondStackTop > 0);
5566      mach->CondMask = mach->CondStack[--mach->CondStackTop];
5567      UPDATE_EXEC_MASK(mach);
5568      break;
5569
5570   case TGSI_OPCODE_END:
5571      /* make sure we end primitives which haven't
5572       * been explicitly emitted */
5573      conditional_emit_primitive(mach);
5574      /* halt execution */
5575      *pc = -1;
5576      break;
5577
5578   case TGSI_OPCODE_PUSHA:
5579      assert (0);
5580      break;
5581
5582   case TGSI_OPCODE_POPA:
5583      assert (0);
5584      break;
5585
5586   case TGSI_OPCODE_CEIL:
5587      exec_vector_unary(mach, inst, micro_ceil, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5588      break;
5589
5590   case TGSI_OPCODE_I2F:
5591      exec_vector_unary(mach, inst, micro_i2f, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_INT);
5592      break;
5593
5594   case TGSI_OPCODE_NOT:
5595      exec_vector_unary(mach, inst, micro_not, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5596      break;
5597
5598   case TGSI_OPCODE_TRUNC:
5599      exec_vector_unary(mach, inst, micro_trunc, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5600      break;
5601
5602   case TGSI_OPCODE_SHL:
5603      exec_vector_binary(mach, inst, micro_shl, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5604      break;
5605
5606   case TGSI_OPCODE_AND:
5607      exec_vector_binary(mach, inst, micro_and, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5608      break;
5609
5610   case TGSI_OPCODE_OR:
5611      exec_vector_binary(mach, inst, micro_or, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5612      break;
5613
5614   case TGSI_OPCODE_MOD:
5615      exec_vector_binary(mach, inst, micro_mod, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5616      break;
5617
5618   case TGSI_OPCODE_XOR:
5619      exec_vector_binary(mach, inst, micro_xor, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5620      break;
5621
5622   case TGSI_OPCODE_SAD:
5623      assert (0);
5624      break;
5625
5626   case TGSI_OPCODE_TXF:
5627      exec_txf(mach, inst);
5628      break;
5629
5630   case TGSI_OPCODE_TXQ:
5631      exec_txq(mach, inst);
5632      break;
5633
5634   case TGSI_OPCODE_EMIT:
5635      emit_vertex(mach);
5636      break;
5637
5638   case TGSI_OPCODE_ENDPRIM:
5639      emit_primitive(mach);
5640      break;
5641
5642   case TGSI_OPCODE_BGNLOOP:
5643      /* push LoopMask and ContMasks */
5644      assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
5645      assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
5646      assert(mach->LoopLabelStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
5647      assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
5648
5649      mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
5650      mach->ContStack[mach->ContStackTop++] = mach->ContMask;
5651      mach->LoopLabelStack[mach->LoopLabelStackTop++] = *pc - 1;
5652      mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
5653      mach->BreakType = TGSI_EXEC_BREAK_INSIDE_LOOP;
5654      break;
5655
5656   case TGSI_OPCODE_ENDLOOP:
5657      /* Restore ContMask, but don't pop */
5658      assert(mach->ContStackTop > 0);
5659      mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
5660      UPDATE_EXEC_MASK(mach);
5661      if (mach->ExecMask) {
5662         /* repeat loop: jump to instruction just past BGNLOOP */
5663         assert(mach->LoopLabelStackTop > 0);
5664         *pc = mach->LoopLabelStack[mach->LoopLabelStackTop - 1] + 1;
5665      }
5666      else {
5667         /* exit loop: pop LoopMask */
5668         assert(mach->LoopStackTop > 0);
5669         mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
5670         /* pop ContMask */
5671         assert(mach->ContStackTop > 0);
5672         mach->ContMask = mach->ContStack[--mach->ContStackTop];
5673         assert(mach->LoopLabelStackTop > 0);
5674         --mach->LoopLabelStackTop;
5675
5676         mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
5677      }
5678      UPDATE_EXEC_MASK(mach);
5679      break;
5680
5681   case TGSI_OPCODE_BRK:
5682      exec_break(mach);
5683      break;
5684
5685   case TGSI_OPCODE_CONT:
5686      /* turn off cont channels for each enabled exec channel */
5687      mach->ContMask &= ~mach->ExecMask;
5688      /* Todo: if mach->LoopMask == 0, jump to end of loop */
5689      UPDATE_EXEC_MASK(mach);
5690      break;
5691
5692   case TGSI_OPCODE_BGNSUB:
5693      /* no-op */
5694      break;
5695
5696   case TGSI_OPCODE_ENDSUB:
5697      /*
5698       * XXX: This really should be a no-op. We should never reach this opcode.
5699       */
5700
5701      assert(mach->CallStackTop > 0);
5702      mach->CallStackTop--;
5703
5704      mach->CondStackTop = mach->CallStack[mach->CallStackTop].CondStackTop;
5705      mach->CondMask = mach->CondStack[mach->CondStackTop];
5706
5707      mach->LoopStackTop = mach->CallStack[mach->CallStackTop].LoopStackTop;
5708      mach->LoopMask = mach->LoopStack[mach->LoopStackTop];
5709
5710      mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
5711      mach->ContMask = mach->ContStack[mach->ContStackTop];
5712
5713      mach->SwitchStackTop = mach->CallStack[mach->CallStackTop].SwitchStackTop;
5714      mach->Switch = mach->SwitchStack[mach->SwitchStackTop];
5715
5716      mach->BreakStackTop = mach->CallStack[mach->CallStackTop].BreakStackTop;
5717      mach->BreakType = mach->BreakStack[mach->BreakStackTop];
5718
5719      assert(mach->FuncStackTop > 0);
5720      mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
5721
5722      *pc = mach->CallStack[mach->CallStackTop].ReturnAddr;
5723
5724      UPDATE_EXEC_MASK(mach);
5725      break;
5726
5727   case TGSI_OPCODE_NOP:
5728      break;
5729
5730   case TGSI_OPCODE_BREAKC:
5731      IFETCH(&r[0], 0, TGSI_CHAN_X);
5732      /* update CondMask */
5733      if (r[0].u[0] && (mach->ExecMask & 0x1)) {
5734         mach->LoopMask &= ~0x1;
5735      }
5736      if (r[0].u[1] && (mach->ExecMask & 0x2)) {
5737         mach->LoopMask &= ~0x2;
5738      }
5739      if (r[0].u[2] && (mach->ExecMask & 0x4)) {
5740         mach->LoopMask &= ~0x4;
5741      }
5742      if (r[0].u[3] && (mach->ExecMask & 0x8)) {
5743         mach->LoopMask &= ~0x8;
5744      }
5745      /* Todo: if mach->LoopMask == 0, jump to end of loop */
5746      UPDATE_EXEC_MASK(mach);
5747      break;
5748
5749   case TGSI_OPCODE_F2I:
5750      exec_vector_unary(mach, inst, micro_f2i, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
5751      break;
5752
5753   case TGSI_OPCODE_FSEQ:
5754      exec_vector_binary(mach, inst, micro_fseq, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
5755      break;
5756
5757   case TGSI_OPCODE_FSGE:
5758      exec_vector_binary(mach, inst, micro_fsge, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
5759      break;
5760
5761   case TGSI_OPCODE_FSLT:
5762      exec_vector_binary(mach, inst, micro_fslt, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
5763      break;
5764
5765   case TGSI_OPCODE_FSNE:
5766      exec_vector_binary(mach, inst, micro_fsne, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
5767      break;
5768
5769   case TGSI_OPCODE_IDIV:
5770      exec_vector_binary(mach, inst, micro_idiv, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5771      break;
5772
5773   case TGSI_OPCODE_IMAX:
5774      exec_vector_binary(mach, inst, micro_imax, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5775      break;
5776
5777   case TGSI_OPCODE_IMIN:
5778      exec_vector_binary(mach, inst, micro_imin, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5779      break;
5780
5781   case TGSI_OPCODE_INEG:
5782      exec_vector_unary(mach, inst, micro_ineg, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5783      break;
5784
5785   case TGSI_OPCODE_ISGE:
5786      exec_vector_binary(mach, inst, micro_isge, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5787      break;
5788
5789   case TGSI_OPCODE_ISHR:
5790      exec_vector_binary(mach, inst, micro_ishr, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5791      break;
5792
5793   case TGSI_OPCODE_ISLT:
5794      exec_vector_binary(mach, inst, micro_islt, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5795      break;
5796
5797   case TGSI_OPCODE_F2U:
5798      exec_vector_unary(mach, inst, micro_f2u, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
5799      break;
5800
5801   case TGSI_OPCODE_U2F:
5802      exec_vector_unary(mach, inst, micro_u2f, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_UINT);
5803      break;
5804
5805   case TGSI_OPCODE_UADD:
5806      exec_vector_binary(mach, inst, micro_uadd, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5807      break;
5808
5809   case TGSI_OPCODE_UDIV:
5810      exec_vector_binary(mach, inst, micro_udiv, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5811      break;
5812
5813   case TGSI_OPCODE_UMAD:
5814      exec_vector_trinary(mach, inst, micro_umad, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5815      break;
5816
5817   case TGSI_OPCODE_UMAX:
5818      exec_vector_binary(mach, inst, micro_umax, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5819      break;
5820
5821   case TGSI_OPCODE_UMIN:
5822      exec_vector_binary(mach, inst, micro_umin, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5823      break;
5824
5825   case TGSI_OPCODE_UMOD:
5826      exec_vector_binary(mach, inst, micro_umod, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5827      break;
5828
5829   case TGSI_OPCODE_UMUL:
5830      exec_vector_binary(mach, inst, micro_umul, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5831      break;
5832
5833   case TGSI_OPCODE_IMUL_HI:
5834      exec_vector_binary(mach, inst, micro_imul_hi, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5835      break;
5836
5837   case TGSI_OPCODE_UMUL_HI:
5838      exec_vector_binary(mach, inst, micro_umul_hi, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5839      break;
5840
5841   case TGSI_OPCODE_USEQ:
5842      exec_vector_binary(mach, inst, micro_useq, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5843      break;
5844
5845   case TGSI_OPCODE_USGE:
5846      exec_vector_binary(mach, inst, micro_usge, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5847      break;
5848
5849   case TGSI_OPCODE_USHR:
5850      exec_vector_binary(mach, inst, micro_ushr, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5851      break;
5852
5853   case TGSI_OPCODE_USLT:
5854      exec_vector_binary(mach, inst, micro_uslt, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5855      break;
5856
5857   case TGSI_OPCODE_USNE:
5858      exec_vector_binary(mach, inst, micro_usne, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5859      break;
5860
5861   case TGSI_OPCODE_SWITCH:
5862      exec_switch(mach, inst);
5863      break;
5864
5865   case TGSI_OPCODE_CASE:
5866      exec_case(mach, inst);
5867      break;
5868
5869   case TGSI_OPCODE_DEFAULT:
5870      exec_default(mach);
5871      break;
5872
5873   case TGSI_OPCODE_ENDSWITCH:
5874      exec_endswitch(mach);
5875      break;
5876
5877   case TGSI_OPCODE_SAMPLE_I:
5878      exec_txf(mach, inst);
5879      break;
5880
5881   case TGSI_OPCODE_SAMPLE_I_MS:
5882      exec_txf(mach, inst);
5883      break;
5884
5885   case TGSI_OPCODE_SAMPLE:
5886      exec_sample(mach, inst, TEX_MODIFIER_NONE, FALSE);
5887      break;
5888
5889   case TGSI_OPCODE_SAMPLE_B:
5890      exec_sample(mach, inst, TEX_MODIFIER_LOD_BIAS, FALSE);
5891      break;
5892
5893   case TGSI_OPCODE_SAMPLE_C:
5894      exec_sample(mach, inst, TEX_MODIFIER_NONE, TRUE);
5895      break;
5896
5897   case TGSI_OPCODE_SAMPLE_C_LZ:
5898      exec_sample(mach, inst, TEX_MODIFIER_LEVEL_ZERO, TRUE);
5899      break;
5900
5901   case TGSI_OPCODE_SAMPLE_D:
5902      exec_sample_d(mach, inst);
5903      break;
5904
5905   case TGSI_OPCODE_SAMPLE_L:
5906      exec_sample(mach, inst, TEX_MODIFIER_EXPLICIT_LOD, FALSE);
5907      break;
5908
5909   case TGSI_OPCODE_GATHER4:
5910      assert(0);
5911      break;
5912
5913   case TGSI_OPCODE_SVIEWINFO:
5914      exec_txq(mach, inst);
5915      break;
5916
5917   case TGSI_OPCODE_SAMPLE_POS:
5918      assert(0);
5919      break;
5920
5921   case TGSI_OPCODE_SAMPLE_INFO:
5922      assert(0);
5923      break;
5924
5925   case TGSI_OPCODE_UARL:
5926      exec_vector_unary(mach, inst, micro_uarl, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_UINT);
5927      break;
5928
5929   case TGSI_OPCODE_UCMP:
5930      exec_vector_trinary(mach, inst, micro_ucmp, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5931      break;
5932
5933   case TGSI_OPCODE_IABS:
5934      exec_vector_unary(mach, inst, micro_iabs, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5935      break;
5936
5937   case TGSI_OPCODE_ISSG:
5938      exec_vector_unary(mach, inst, micro_isgn, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5939      break;
5940
5941   case TGSI_OPCODE_TEX2:
5942      /* simple texture lookup */
5943      /* src[0] = texcoord */
5944      /* src[1] = compare */
5945      /* src[2] = sampler unit */
5946      exec_tex(mach, inst, TEX_MODIFIER_NONE, 2);
5947      break;
5948   case TGSI_OPCODE_TXB2:
5949      /* simple texture lookup */
5950      /* src[0] = texcoord */
5951      /* src[1] = bias */
5952      /* src[2] = sampler unit */
5953      exec_tex(mach, inst, TEX_MODIFIER_LOD_BIAS, 2);
5954      break;
5955   case TGSI_OPCODE_TXL2:
5956      /* simple texture lookup */
5957      /* src[0] = texcoord */
5958      /* src[1] = lod */
5959      /* src[2] = sampler unit */
5960      exec_tex(mach, inst, TEX_MODIFIER_EXPLICIT_LOD, 2);
5961      break;
5962
5963   case TGSI_OPCODE_IBFE:
5964      exec_vector_trinary(mach, inst, micro_ibfe, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5965      break;
5966   case TGSI_OPCODE_UBFE:
5967      exec_vector_trinary(mach, inst, micro_ubfe, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5968      break;
5969   case TGSI_OPCODE_BFI:
5970      exec_vector_quaternary(mach, inst, micro_bfi, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5971      break;
5972   case TGSI_OPCODE_BREV:
5973      exec_vector_unary(mach, inst, micro_brev, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5974      break;
5975   case TGSI_OPCODE_POPC:
5976      exec_vector_unary(mach, inst, micro_popc, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5977      break;
5978   case TGSI_OPCODE_LSB:
5979      exec_vector_unary(mach, inst, micro_lsb, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_UINT);
5980      break;
5981   case TGSI_OPCODE_IMSB:
5982      exec_vector_unary(mach, inst, micro_imsb, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5983      break;
5984   case TGSI_OPCODE_UMSB:
5985      exec_vector_unary(mach, inst, micro_umsb, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_UINT);
5986      break;
5987
5988   case TGSI_OPCODE_F2D:
5989      exec_t_2_64(mach, inst, micro_f2d, TGSI_EXEC_DATA_FLOAT);
5990      break;
5991
5992   case TGSI_OPCODE_D2F:
5993      exec_64_2_t(mach, inst, micro_d2f, TGSI_EXEC_DATA_FLOAT);
5994      break;
5995
5996   case TGSI_OPCODE_DABS:
5997      exec_double_unary(mach, inst, micro_dabs);
5998      break;
5999
6000   case TGSI_OPCODE_DNEG:
6001      exec_double_unary(mach, inst, micro_dneg);
6002      break;
6003
6004   case TGSI_OPCODE_DADD:
6005      exec_double_binary(mach, inst, micro_dadd, TGSI_EXEC_DATA_DOUBLE);
6006      break;
6007
6008   case TGSI_OPCODE_DDIV:
6009      exec_double_binary(mach, inst, micro_ddiv, TGSI_EXEC_DATA_DOUBLE);
6010      break;
6011
6012   case TGSI_OPCODE_DMUL:
6013      exec_double_binary(mach, inst, micro_dmul, TGSI_EXEC_DATA_DOUBLE);
6014      break;
6015
6016   case TGSI_OPCODE_DMAX:
6017      exec_double_binary(mach, inst, micro_dmax, TGSI_EXEC_DATA_DOUBLE);
6018      break;
6019
6020   case TGSI_OPCODE_DMIN:
6021      exec_double_binary(mach, inst, micro_dmin, TGSI_EXEC_DATA_DOUBLE);
6022      break;
6023
6024   case TGSI_OPCODE_DSLT:
6025      exec_double_binary(mach, inst, micro_dslt, TGSI_EXEC_DATA_UINT);
6026      break;
6027
6028   case TGSI_OPCODE_DSGE:
6029      exec_double_binary(mach, inst, micro_dsge, TGSI_EXEC_DATA_UINT);
6030      break;
6031
6032   case TGSI_OPCODE_DSEQ:
6033      exec_double_binary(mach, inst, micro_dseq, TGSI_EXEC_DATA_UINT);
6034      break;
6035
6036   case TGSI_OPCODE_DSNE:
6037      exec_double_binary(mach, inst, micro_dsne, TGSI_EXEC_DATA_UINT);
6038      break;
6039
6040   case TGSI_OPCODE_DRCP:
6041      exec_double_unary(mach, inst, micro_drcp);
6042      break;
6043
6044   case TGSI_OPCODE_DSQRT:
6045      exec_double_unary(mach, inst, micro_dsqrt);
6046      break;
6047
6048   case TGSI_OPCODE_DRSQ:
6049      exec_double_unary(mach, inst, micro_drsq);
6050      break;
6051
6052   case TGSI_OPCODE_DMAD:
6053      exec_double_trinary(mach, inst, micro_dmad);
6054      break;
6055
6056   case TGSI_OPCODE_DFRAC:
6057      exec_double_unary(mach, inst, micro_dfrac);
6058      break;
6059
6060   case TGSI_OPCODE_DLDEXP:
6061      exec_dldexp(mach, inst);
6062      break;
6063
6064   case TGSI_OPCODE_DFRACEXP:
6065      exec_dfracexp(mach, inst);
6066      break;
6067
6068   case TGSI_OPCODE_I2D:
6069      exec_t_2_64(mach, inst, micro_i2d, TGSI_EXEC_DATA_INT);
6070      break;
6071
6072   case TGSI_OPCODE_D2I:
6073      exec_64_2_t(mach, inst, micro_d2i, TGSI_EXEC_DATA_INT);
6074      break;
6075
6076   case TGSI_OPCODE_U2D:
6077      exec_t_2_64(mach, inst, micro_u2d, TGSI_EXEC_DATA_UINT);
6078      break;
6079
6080   case TGSI_OPCODE_D2U:
6081      exec_64_2_t(mach, inst, micro_d2u, TGSI_EXEC_DATA_INT);
6082      break;
6083
6084   case TGSI_OPCODE_LOAD:
6085      exec_load(mach, inst);
6086      break;
6087
6088   case TGSI_OPCODE_STORE:
6089      exec_store(mach, inst);
6090      break;
6091
6092   case TGSI_OPCODE_ATOMUADD:
6093   case TGSI_OPCODE_ATOMXCHG:
6094   case TGSI_OPCODE_ATOMCAS:
6095   case TGSI_OPCODE_ATOMAND:
6096   case TGSI_OPCODE_ATOMOR:
6097   case TGSI_OPCODE_ATOMXOR:
6098   case TGSI_OPCODE_ATOMUMIN:
6099   case TGSI_OPCODE_ATOMUMAX:
6100   case TGSI_OPCODE_ATOMIMIN:
6101   case TGSI_OPCODE_ATOMIMAX:
6102      exec_atomop(mach, inst);
6103      break;
6104
6105   case TGSI_OPCODE_RESQ:
6106      exec_resq(mach, inst);
6107      break;
6108   case TGSI_OPCODE_BARRIER:
6109   case TGSI_OPCODE_MEMBAR:
6110      return TRUE;
6111      break;
6112
6113   case TGSI_OPCODE_I64ABS:
6114      exec_double_unary(mach, inst, micro_i64abs);
6115      break;
6116
6117   case TGSI_OPCODE_I64SSG:
6118      exec_double_unary(mach, inst, micro_i64sgn);
6119      break;
6120
6121   case TGSI_OPCODE_I64NEG:
6122      exec_double_unary(mach, inst, micro_i64neg);
6123      break;
6124
6125   case TGSI_OPCODE_U64SEQ:
6126      exec_double_binary(mach, inst, micro_u64seq, TGSI_EXEC_DATA_UINT);
6127      break;
6128
6129   case TGSI_OPCODE_U64SNE:
6130      exec_double_binary(mach, inst, micro_u64sne, TGSI_EXEC_DATA_UINT);
6131      break;
6132
6133   case TGSI_OPCODE_I64SLT:
6134      exec_double_binary(mach, inst, micro_i64slt, TGSI_EXEC_DATA_UINT);
6135      break;
6136   case TGSI_OPCODE_U64SLT:
6137      exec_double_binary(mach, inst, micro_u64slt, TGSI_EXEC_DATA_UINT);
6138      break;
6139
6140   case TGSI_OPCODE_I64SGE:
6141      exec_double_binary(mach, inst, micro_i64sge, TGSI_EXEC_DATA_UINT);
6142      break;
6143   case TGSI_OPCODE_U64SGE:
6144      exec_double_binary(mach, inst, micro_u64sge, TGSI_EXEC_DATA_UINT);
6145      break;
6146
6147   case TGSI_OPCODE_I64MIN:
6148      exec_double_binary(mach, inst, micro_i64min, TGSI_EXEC_DATA_INT64);
6149      break;
6150   case TGSI_OPCODE_U64MIN:
6151      exec_double_binary(mach, inst, micro_u64min, TGSI_EXEC_DATA_UINT64);
6152      break;
6153   case TGSI_OPCODE_I64MAX:
6154      exec_double_binary(mach, inst, micro_i64max, TGSI_EXEC_DATA_INT64);
6155      break;
6156   case TGSI_OPCODE_U64MAX:
6157      exec_double_binary(mach, inst, micro_u64max, TGSI_EXEC_DATA_UINT64);
6158      break;
6159   case TGSI_OPCODE_U64ADD:
6160      exec_double_binary(mach, inst, micro_u64add, TGSI_EXEC_DATA_UINT64);
6161      break;
6162   case TGSI_OPCODE_U64MUL:
6163      exec_double_binary(mach, inst, micro_u64mul, TGSI_EXEC_DATA_UINT64);
6164      break;
6165   case TGSI_OPCODE_U64SHL:
6166      exec_arg0_64_arg1_32(mach, inst, micro_u64shl);
6167      break;
6168   case TGSI_OPCODE_I64SHR:
6169      exec_arg0_64_arg1_32(mach, inst, micro_i64shr);
6170      break;
6171   case TGSI_OPCODE_U64SHR:
6172      exec_arg0_64_arg1_32(mach, inst, micro_u64shr);
6173      break;
6174   case TGSI_OPCODE_U64DIV:
6175      exec_double_binary(mach, inst, micro_u64div, TGSI_EXEC_DATA_UINT64);
6176      break;
6177   case TGSI_OPCODE_I64DIV:
6178      exec_double_binary(mach, inst, micro_i64div, TGSI_EXEC_DATA_INT64);
6179      break;
6180   case TGSI_OPCODE_U64MOD:
6181      exec_double_binary(mach, inst, micro_u64mod, TGSI_EXEC_DATA_UINT64);
6182      break;
6183   case TGSI_OPCODE_I64MOD:
6184      exec_double_binary(mach, inst, micro_i64mod, TGSI_EXEC_DATA_INT64);
6185      break;
6186
6187   case TGSI_OPCODE_F2U64:
6188      exec_t_2_64(mach, inst, micro_f2u64, TGSI_EXEC_DATA_FLOAT);
6189      break;
6190
6191   case TGSI_OPCODE_F2I64:
6192      exec_t_2_64(mach, inst, micro_f2i64, TGSI_EXEC_DATA_FLOAT);
6193      break;
6194
6195   case TGSI_OPCODE_U2I64:
6196      exec_t_2_64(mach, inst, micro_u2i64, TGSI_EXEC_DATA_INT);
6197      break;
6198   case TGSI_OPCODE_I2I64:
6199      exec_t_2_64(mach, inst, micro_i2i64, TGSI_EXEC_DATA_INT);
6200      break;
6201
6202   case TGSI_OPCODE_D2U64:
6203      exec_double_unary(mach, inst, micro_d2u64);
6204      break;
6205
6206   case TGSI_OPCODE_D2I64:
6207      exec_double_unary(mach, inst, micro_d2i64);
6208      break;
6209
6210   case TGSI_OPCODE_U642F:
6211      exec_64_2_t(mach, inst, micro_u642f, TGSI_EXEC_DATA_FLOAT);
6212      break;
6213   case TGSI_OPCODE_I642F:
6214      exec_64_2_t(mach, inst, micro_i642f, TGSI_EXEC_DATA_FLOAT);
6215      break;
6216
6217   case TGSI_OPCODE_U642D:
6218      exec_double_unary(mach, inst, micro_u642d);
6219      break;
6220   case TGSI_OPCODE_I642D:
6221      exec_double_unary(mach, inst, micro_i642d);
6222      break;
6223
6224   default:
6225      assert( 0 );
6226   }
6227   return FALSE;
6228}
6229
6230static void
6231tgsi_exec_machine_setup_masks(struct tgsi_exec_machine *mach)
6232{
6233   uint default_mask = 0xf;
6234
6235   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] = 0;
6236   mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] = 0;
6237
6238   if (mach->ShaderType == PIPE_SHADER_GEOMETRY) {
6239      mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0] = 0;
6240      mach->Primitives[0] = 0;
6241      /* GS runs on a single primitive for now */
6242      default_mask = 0x1;
6243   }
6244
6245   if (mach->NonHelperMask == 0)
6246      mach->NonHelperMask = default_mask;
6247   mach->CondMask = default_mask;
6248   mach->LoopMask = default_mask;
6249   mach->ContMask = default_mask;
6250   mach->FuncMask = default_mask;
6251   mach->ExecMask = default_mask;
6252
6253   mach->Switch.mask = default_mask;
6254
6255   assert(mach->CondStackTop == 0);
6256   assert(mach->LoopStackTop == 0);
6257   assert(mach->ContStackTop == 0);
6258   assert(mach->SwitchStackTop == 0);
6259   assert(mach->BreakStackTop == 0);
6260   assert(mach->CallStackTop == 0);
6261}
6262
6263/**
6264 * Run TGSI interpreter.
6265 * \return bitmask of "alive" quad components
6266 */
6267uint
6268tgsi_exec_machine_run( struct tgsi_exec_machine *mach, int start_pc )
6269{
6270   uint i;
6271
6272   mach->pc = start_pc;
6273
6274   if (!start_pc) {
6275      tgsi_exec_machine_setup_masks(mach);
6276
6277      /* execute declarations (interpolants) */
6278      for (i = 0; i < mach->NumDeclarations; i++) {
6279         exec_declaration( mach, mach->Declarations+i );
6280      }
6281   }
6282
6283   {
6284#if DEBUG_EXECUTION
6285      struct tgsi_exec_vector temps[TGSI_EXEC_NUM_TEMPS + TGSI_EXEC_NUM_TEMP_EXTRAS];
6286      struct tgsi_exec_vector outputs[PIPE_MAX_ATTRIBS];
6287      uint inst = 1;
6288
6289      if (!start_pc) {
6290         memset(mach->Temps, 0, sizeof(temps));
6291         if (mach->Outputs)
6292            memset(mach->Outputs, 0, sizeof(outputs));
6293         memset(temps, 0, sizeof(temps));
6294         memset(outputs, 0, sizeof(outputs));
6295      }
6296#endif
6297
6298      /* execute instructions, until pc is set to -1 */
6299      while (mach->pc != -1) {
6300         boolean barrier_hit;
6301#if DEBUG_EXECUTION
6302         uint i;
6303
6304         tgsi_dump_instruction(&mach->Instructions[mach->pc], inst++);
6305#endif
6306
6307         assert(mach->pc < (int) mach->NumInstructions);
6308         barrier_hit = exec_instruction(mach, mach->Instructions + mach->pc, &mach->pc);
6309
6310         /* for compute shaders if we hit a barrier return now for later rescheduling */
6311         if (barrier_hit && mach->ShaderType == PIPE_SHADER_COMPUTE)
6312            return 0;
6313
6314#if DEBUG_EXECUTION
6315         for (i = 0; i < TGSI_EXEC_NUM_TEMPS + TGSI_EXEC_NUM_TEMP_EXTRAS; i++) {
6316            if (memcmp(&temps[i], &mach->Temps[i], sizeof(temps[i]))) {
6317               uint j;
6318
6319               memcpy(&temps[i], &mach->Temps[i], sizeof(temps[i]));
6320               debug_printf("TEMP[%2u] = ", i);
6321               for (j = 0; j < 4; j++) {
6322                  if (j > 0) {
6323                     debug_printf("           ");
6324                  }
6325                  debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
6326                               temps[i].xyzw[0].f[j], temps[i].xyzw[0].u[j],
6327                               temps[i].xyzw[1].f[j], temps[i].xyzw[1].u[j],
6328                               temps[i].xyzw[2].f[j], temps[i].xyzw[2].u[j],
6329                               temps[i].xyzw[3].f[j], temps[i].xyzw[3].u[j]);
6330               }
6331            }
6332         }
6333         if (mach->Outputs) {
6334            for (i = 0; i < PIPE_MAX_ATTRIBS; i++) {
6335               if (memcmp(&outputs[i], &mach->Outputs[i], sizeof(outputs[i]))) {
6336                  uint j;
6337
6338                  memcpy(&outputs[i], &mach->Outputs[i], sizeof(outputs[i]));
6339                  debug_printf("OUT[%2u] =  ", i);
6340                  for (j = 0; j < 4; j++) {
6341                     if (j > 0) {
6342                        debug_printf("           ");
6343                     }
6344                     debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
6345                                  outputs[i].xyzw[0].f[j], outputs[i].xyzw[0].u[j],
6346                                  outputs[i].xyzw[1].f[j], outputs[i].xyzw[1].u[j],
6347                                  outputs[i].xyzw[2].f[j], outputs[i].xyzw[2].u[j],
6348                                  outputs[i].xyzw[3].f[j], outputs[i].xyzw[3].u[j]);
6349                  }
6350               }
6351            }
6352         }
6353#endif
6354      }
6355   }
6356
6357#if 0
6358   /* we scale from floats in [0,1] to Zbuffer ints in sp_quad_depth_test.c */
6359   if (mach->ShaderType == PIPE_SHADER_FRAGMENT) {
6360      /*
6361       * Scale back depth component.
6362       */
6363      for (i = 0; i < 4; i++)
6364         mach->Outputs[0].xyzw[2].f[i] *= ctx->DrawBuffer->_DepthMaxF;
6365   }
6366#endif
6367
6368   /* Strictly speaking, these assertions aren't really needed but they
6369    * can potentially catch some bugs in the control flow code.
6370    */
6371   assert(mach->CondStackTop == 0);
6372   assert(mach->LoopStackTop == 0);
6373   assert(mach->ContStackTop == 0);
6374   assert(mach->SwitchStackTop == 0);
6375   assert(mach->BreakStackTop == 0);
6376   assert(mach->CallStackTop == 0);
6377
6378   return ~mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
6379}
6380