lp_bld_tgsi_soa.c revision 3d5b9c1f2d3340259dd0d8765090a5a963074f29
1/**************************************************************************
2 *
3 * Copyright 2009 VMware, Inc.
4 * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
5 * All Rights Reserved.
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the
9 * "Software"), to deal in the Software without restriction, including
10 * without limitation the rights to use, copy, modify, merge, publish,
11 * distribute, sub license, and/or sell copies of the Software, and to
12 * permit persons to whom the Software is furnished to do so, subject to
13 * the following conditions:
14 *
15 * The above copyright notice and this permission notice (including the
16 * next paragraph) shall be included in all copies or substantial portions
17 * of the Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
22 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
23 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
24 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
25 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 *
27 **************************************************************************/
28
29/**
30 * @file
31 * TGSI to LLVM IR translation -- SoA.
32 *
33 * @author Jose Fonseca <jfonseca@vmware.com>
34 *
35 * Based on tgsi_sse2.c code written by Michal Krol, Keith Whitwell,
36 * Brian Paul, and others.
37 */
38
39#include "pipe/p_config.h"
40#include "pipe/p_shader_tokens.h"
41#include "util/u_debug.h"
42#include "util/u_math.h"
43#include "util/u_memory.h"
44#include "tgsi/tgsi_dump.h"
45#include "tgsi/tgsi_info.h"
46#include "tgsi/tgsi_parse.h"
47#include "tgsi/tgsi_util.h"
48#include "tgsi/tgsi_scan.h"
49#include "lp_bld_type.h"
50#include "lp_bld_const.h"
51#include "lp_bld_arit.h"
52#include "lp_bld_gather.h"
53#include "lp_bld_logic.h"
54#include "lp_bld_swizzle.h"
55#include "lp_bld_flow.h"
56#include "lp_bld_quad.h"
57#include "lp_bld_tgsi.h"
58#include "lp_bld_limits.h"
59#include "lp_bld_debug.h"
60
61
62#define FOR_EACH_CHANNEL( CHAN )\
63   for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)
64
65#define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
66   ((INST)->Dst[0].Register.WriteMask & (1 << (CHAN)))
67
68#define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
69   if (IS_DST0_CHANNEL_ENABLED( INST, CHAN ))
70
71#define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\
72   FOR_EACH_CHANNEL( CHAN )\
73      IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )
74
75#define CHAN_X 0
76#define CHAN_Y 1
77#define CHAN_Z 2
78#define CHAN_W 3
79#define NUM_CHANNELS 4
80
81#define LP_MAX_INSTRUCTIONS 256
82
83
84struct lp_exec_mask {
85   struct lp_build_context *bld;
86
87   boolean has_mask;
88
89   LLVMTypeRef int_vec_type;
90
91   LLVMValueRef cond_stack[LP_MAX_TGSI_NESTING];
92   int cond_stack_size;
93   LLVMValueRef cond_mask;
94
95   LLVMBasicBlockRef loop_block;
96   LLVMValueRef cont_mask;
97   LLVMValueRef break_mask;
98   LLVMValueRef break_var;
99   struct {
100      LLVMBasicBlockRef loop_block;
101      LLVMValueRef cont_mask;
102      LLVMValueRef break_mask;
103      LLVMValueRef break_var;
104   } loop_stack[LP_MAX_TGSI_NESTING];
105   int loop_stack_size;
106
107   LLVMValueRef ret_mask;
108   struct {
109      int pc;
110      LLVMValueRef ret_mask;
111   } call_stack[LP_MAX_TGSI_NESTING];
112   int call_stack_size;
113
114   LLVMValueRef exec_mask;
115};
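/*
 * Illustrative sketch (not part of the original code): the effective
 * execution mask is the AND of the per-construct masks tracked above,
 * roughly as computed by lp_exec_mask_update() below:
 *
 *    exec_mask = cond_mask;                      -- IF/ELSE nesting
 *    if (inside a loop)
 *       exec_mask &= cont_mask & break_mask;     -- CONT/BRK
 *    if (inside a subroutine)
 *       exec_mask &= ret_mask;                   -- RET
 *
 * A lane with all bits set keeps executing; a lane of zeros is inactive.
 */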
116
117struct lp_build_tgsi_soa_context
118{
119   struct lp_build_context base;
120
121   /* Builder for integer masks and indices */
122   struct lp_build_context int_bld;
123
124   LLVMValueRef consts_ptr;
125   const LLVMValueRef *pos;
126   const LLVMValueRef (*inputs)[NUM_CHANNELS];
127   LLVMValueRef (*outputs)[NUM_CHANNELS];
128
129   const struct lp_build_sampler_soa *sampler;
130
131   LLVMValueRef immediates[LP_MAX_TGSI_IMMEDIATES][NUM_CHANNELS];
132   LLVMValueRef temps[LP_MAX_TGSI_TEMPS][NUM_CHANNELS];
133   LLVMValueRef addr[LP_MAX_TGSI_ADDRS][NUM_CHANNELS];
134   LLVMValueRef preds[LP_MAX_TGSI_PREDS][NUM_CHANNELS];
135
136   /* We allocate/use this array of temps if (1 << TGSI_FILE_TEMPORARY) is
137    * set in the indirect_files field.
138    * The temps[] array above is unused then.
139    */
140   LLVMValueRef temps_array;
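   /*
    * Illustrative sketch (drawn from get_temp_ptr() and emit_declaration()
    * below): when temporaries are accessed indirectly, temps_array is a
    * flat alloca of (last_temp_index + 1) * 4 vectors laid out so that
    *
    *    temps_array[index * 4 + chan]   ~   TEMP[index].chan (one SoA vector)
    *
    * Direct accesses GEP with (index * 4 + chan); indirect fetches recast
    * the pointer to float* and gather per lane.
    */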
141
142   /** bitmask indicating which register files are accessed indirectly */
143   unsigned indirect_files;
144
145   struct lp_build_mask_context *mask;
146   struct lp_exec_mask exec_mask;
147
148   struct tgsi_full_instruction *instructions;
149   uint max_instructions;
150};
151
152static void lp_exec_mask_init(struct lp_exec_mask *mask, struct lp_build_context *bld)
153{
154   mask->bld = bld;
155   mask->has_mask = FALSE;
156   mask->cond_stack_size = 0;
157   mask->loop_stack_size = 0;
158   mask->call_stack_size = 0;
159
160   mask->int_vec_type = lp_build_int_vec_type(mask->bld->type);
161   mask->exec_mask = mask->ret_mask = mask->break_mask = mask->cont_mask = mask->cond_mask =
162         LLVMConstAllOnes(mask->int_vec_type);
163}
164
165static void lp_exec_mask_update(struct lp_exec_mask *mask)
166{
167   if (mask->loop_stack_size) {
168      /* for loops we need to update the entire mask at runtime */
169      LLVMValueRef tmp;
170      assert(mask->break_mask);
171      tmp = LLVMBuildAnd(mask->bld->builder,
172                         mask->cont_mask,
173                         mask->break_mask,
174                         "maskcb");
175      mask->exec_mask = LLVMBuildAnd(mask->bld->builder,
176                                     mask->cond_mask,
177                                     tmp,
178                                     "maskfull");
179   } else
180      mask->exec_mask = mask->cond_mask;
181
182   if (mask->call_stack_size) {
183      mask->exec_mask = LLVMBuildAnd(mask->bld->builder,
184                                     mask->exec_mask,
185                                     mask->ret_mask,
186                                     "callmask");
187   }
188
189   mask->has_mask = (mask->cond_stack_size > 0 ||
190                     mask->loop_stack_size > 0 ||
191                     mask->call_stack_size > 0);
192}
193
194static void lp_exec_mask_cond_push(struct lp_exec_mask *mask,
195                                   LLVMValueRef val)
196{
197   assert(mask->cond_stack_size < LP_MAX_TGSI_NESTING);
198   if (mask->cond_stack_size == 0) {
199      assert(mask->cond_mask == LLVMConstAllOnes(mask->int_vec_type));
200   }
201   mask->cond_stack[mask->cond_stack_size++] = mask->cond_mask;
202   assert(LLVMTypeOf(val) == mask->int_vec_type);
203   mask->cond_mask = LLVMBuildAnd(mask->bld->builder,
204                                  mask->cond_mask,
205                                  val,
206                                  "");
207   lp_exec_mask_update(mask);
208}
209
210static void lp_exec_mask_cond_invert(struct lp_exec_mask *mask)
211{
212   LLVMValueRef prev_mask;
213   LLVMValueRef inv_mask;
214
215   assert(mask->cond_stack_size);
216   prev_mask = mask->cond_stack[mask->cond_stack_size - 1];
217   if (mask->cond_stack_size == 1) {
218      assert(prev_mask == LLVMConstAllOnes(mask->int_vec_type));
219   }
220
221   inv_mask = LLVMBuildNot(mask->bld->builder, mask->cond_mask, "");
222
223   mask->cond_mask = LLVMBuildAnd(mask->bld->builder,
224                                  inv_mask,
225                                  prev_mask, "");
226   lp_exec_mask_update(mask);
227}
228
229static void lp_exec_mask_cond_pop(struct lp_exec_mask *mask)
230{
231   assert(mask->cond_stack_size);
232   mask->cond_mask = mask->cond_stack[--mask->cond_stack_size];
233   lp_exec_mask_update(mask);
234}
235
236static void lp_exec_bgnloop(struct lp_exec_mask *mask)
237{
238   if (mask->loop_stack_size == 0) {
239      assert(mask->loop_block == NULL);
240      assert(mask->cont_mask == LLVMConstAllOnes(mask->int_vec_type));
241      assert(mask->break_mask == LLVMConstAllOnes(mask->int_vec_type));
242      assert(mask->break_var == NULL);
243   }
244
245   assert(mask->loop_stack_size < LP_MAX_TGSI_NESTING);
246
247   mask->loop_stack[mask->loop_stack_size].loop_block = mask->loop_block;
248   mask->loop_stack[mask->loop_stack_size].cont_mask = mask->cont_mask;
249   mask->loop_stack[mask->loop_stack_size].break_mask = mask->break_mask;
250   mask->loop_stack[mask->loop_stack_size].break_var = mask->break_var;
251   ++mask->loop_stack_size;
252
253   mask->break_var = lp_build_alloca(mask->bld->builder, mask->int_vec_type, "");
254   LLVMBuildStore(mask->bld->builder, mask->break_mask, mask->break_var);
255
256   mask->loop_block = lp_build_insert_new_block(mask->bld->builder, "bgnloop");
257   LLVMBuildBr(mask->bld->builder, mask->loop_block);
258   LLVMPositionBuilderAtEnd(mask->bld->builder, mask->loop_block);
259
260   mask->break_mask = LLVMBuildLoad(mask->bld->builder, mask->break_var, "");
261
262   lp_exec_mask_update(mask);
263}
264
265static void lp_exec_break(struct lp_exec_mask *mask)
266{
267   LLVMValueRef exec_mask = LLVMBuildNot(mask->bld->builder,
268                                         mask->exec_mask,
269                                         "break");
270
271   mask->break_mask = LLVMBuildAnd(mask->bld->builder,
272                                   mask->break_mask,
273                                   exec_mask, "break_full");
274
275   lp_exec_mask_update(mask);
276}
277
278static void lp_exec_continue(struct lp_exec_mask *mask)
279{
280   LLVMValueRef exec_mask = LLVMBuildNot(mask->bld->builder,
281                                         mask->exec_mask,
282                                         "");
283
284   mask->cont_mask = LLVMBuildAnd(mask->bld->builder,
285                                  mask->cont_mask,
286                                  exec_mask, "");
287
288   lp_exec_mask_update(mask);
289}
290
291
292static void lp_exec_endloop(struct lp_exec_mask *mask)
293{
294   LLVMBasicBlockRef endloop;
295   LLVMTypeRef reg_type = LLVMIntType(mask->bld->type.width*
296                                      mask->bld->type.length);
297   LLVMValueRef i1cond;
298
299   assert(mask->break_mask);
300
301   /*
302    * Restore the cont_mask, but don't pop
303    */
304   assert(mask->loop_stack_size);
305   mask->cont_mask = mask->loop_stack[mask->loop_stack_size - 1].cont_mask;
306   lp_exec_mask_update(mask);
307
308   /*
309    * Unlike the continue mask, the break_mask must be preserved across loop
310    * iterations
311    */
312   LLVMBuildStore(mask->bld->builder, mask->break_mask, mask->break_var);
313
314   /* i1cond = (exec_mask != 0), i.e. keep looping while any lane is active */
315   i1cond = LLVMBuildICmp(
316      mask->bld->builder,
317      LLVMIntNE,
318      LLVMBuildBitCast(mask->bld->builder, mask->exec_mask, reg_type, ""),
319      LLVMConstNull(reg_type), "");
320
321   endloop = lp_build_insert_new_block(mask->bld->builder, "endloop");
322
323   LLVMBuildCondBr(mask->bld->builder,
324                   i1cond, mask->loop_block, endloop);
325
326   LLVMPositionBuilderAtEnd(mask->bld->builder, endloop);
327
328   assert(mask->loop_stack_size);
329   --mask->loop_stack_size;
330   mask->loop_block = mask->loop_stack[mask->loop_stack_size].loop_block;
331   mask->cont_mask = mask->loop_stack[mask->loop_stack_size].cont_mask;
332   mask->break_mask = mask->loop_stack[mask->loop_stack_size].break_mask;
333   mask->break_var = mask->loop_stack[mask->loop_stack_size].break_var;
334
335   lp_exec_mask_update(mask);
336}
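/*
 * Illustrative control flow (sketch only): BGNLOOP creates the "bgnloop"
 * block and ENDLOOP branches back to it while any lane of exec_mask is
 * still live, roughly
 *
 *    bgnloop:  break_mask = *break_var;  ...loop body...
 *    endloop:  *break_var = break_mask;
 *              if (exec_mask != 0) goto bgnloop; else fall through
 */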
337
338/* Store 'val' to the address pointed to by 'dst'.
339 * mask->exec_mask is used to determine which lanes of 'val'
340 * should actually be stored to the address
341 * (a lane of 0 means don't store, ~0 means store).
342 */
343static void lp_exec_mask_store(struct lp_exec_mask *mask,
344                               LLVMValueRef pred,
345                               LLVMValueRef val,
346                               LLVMValueRef dst)
347{
348   /* Mix the predicate and execution mask */
349   if (mask->has_mask) {
350      if (pred) {
351         pred = LLVMBuildAnd(mask->bld->builder, pred, mask->exec_mask, "");
352      } else {
353         pred = mask->exec_mask;
354      }
355   }
356
357   if (pred) {
358      LLVMValueRef real_val, dst_val;
359
360      dst_val = LLVMBuildLoad(mask->bld->builder, dst, "");
361      real_val = lp_build_select(mask->bld,
362                                 pred,
363                                 val, dst_val);
364
365      LLVMBuildStore(mask->bld->builder, real_val, dst);
366   } else
367      LLVMBuildStore(mask->bld->builder, val, dst);
368}
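/*
 * Usage sketch (illustrative, not from the original source): with an
 * active execution mask the store above behaves like a per-lane select,
 *
 *    combined = pred ? (pred & exec_mask) : exec_mask;
 *    *dst     = select(combined, val, *dst);
 *
 * so inactive lanes keep whatever value was already in *dst.
 */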
369
370static void lp_exec_mask_call(struct lp_exec_mask *mask,
371                              int func,
372                              int *pc)
373{
374   assert(mask->call_stack_size < LP_MAX_TGSI_NESTING);
375   mask->call_stack[mask->call_stack_size].pc = *pc;
376   mask->call_stack[mask->call_stack_size].ret_mask = mask->ret_mask;
377   mask->call_stack_size++;
378   *pc = func;
379}
380
381static void lp_exec_mask_ret(struct lp_exec_mask *mask, int *pc)
382{
383   LLVMValueRef exec_mask;
384
385   if (mask->call_stack_size == 0) {
386      /* returning from main() */
387      *pc = -1;
388      return;
389   }
390   exec_mask = LLVMBuildNot(mask->bld->builder,
391                            mask->exec_mask,
392                            "ret");
393
394   mask->ret_mask = LLVMBuildAnd(mask->bld->builder,
395                                 mask->ret_mask,
396                                 exec_mask, "ret_full");
397
398   lp_exec_mask_update(mask);
399}
400
401static void lp_exec_mask_bgnsub(struct lp_exec_mask *mask)
402{
403}
404
405static void lp_exec_mask_endsub(struct lp_exec_mask *mask, int *pc)
406{
407   assert(mask->call_stack_size);
408   mask->call_stack_size--;
409   *pc = mask->call_stack[mask->call_stack_size].pc;
410   mask->ret_mask = mask->call_stack[mask->call_stack_size].ret_mask;
411   lp_exec_mask_update(mask);
412}
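/*
 * Illustrative sketch of how CAL/RET are interpreted with these helpers:
 *
 *    CAL label -> lp_exec_mask_call():   push (pc, ret_mask); pc = label
 *    RET       -> lp_exec_mask_ret():    ret_mask &= ~exec_mask
 *                                        (returned lanes go inactive)
 *    ENDSUB    -> lp_exec_mask_endsub(): pop; restore pc and ret_mask
 *
 * RET from main() simply sets pc = -1 and ends interpretation.
 */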
413
414
415/**
416 * Return pointer to a temporary register channel (src or dest).
417 * Note that indirect addressing cannot be handled here.
418 * \param index  which temporary register
419 * \param chan  which channel of the temp register.
420 */
421static LLVMValueRef
422get_temp_ptr(struct lp_build_tgsi_soa_context *bld,
423             unsigned index,
424             unsigned chan)
425{
426   assert(chan < 4);
427   if (bld->indirect_files & (1 << TGSI_FILE_TEMPORARY)) {
428      LLVMValueRef lindex = lp_build_const_int32(index * 4 + chan);
429      return LLVMBuildGEP(bld->base.builder, bld->temps_array, &lindex, 1, "");
430   }
431   else {
432      return bld->temps[index][chan];
433   }
434}
435
436
437/**
438 * Gather vector.
439 * XXX the lp_build_gather() function should be capable of doing this
440 * with a little work.
441 */
442static LLVMValueRef
443build_gather(struct lp_build_tgsi_soa_context *bld,
444             LLVMValueRef base_ptr,
445             LLVMValueRef indexes)
446{
447   LLVMValueRef res = bld->base.undef;
448   unsigned i;
449
450   /*
451    * Loop over the elements of 'indexes', load each scalar value, and insert it into 'res'.
452    */
453   for (i = 0; i < bld->base.type.length; i++) {
454      LLVMValueRef ii = LLVMConstInt(LLVMInt32Type(), i, 0);
455      LLVMValueRef index = LLVMBuildExtractElement(bld->base.builder,
456                                                   indexes, ii, "");
457      LLVMValueRef scalar_ptr = LLVMBuildGEP(bld->base.builder, base_ptr,
458                                             &index, 1, "");
459      LLVMValueRef scalar = LLVMBuildLoad(bld->base.builder, scalar_ptr, "");
460
461      res = LLVMBuildInsertElement(bld->base.builder, res, scalar, ii, "");
462   }
463
464   return res;
465}
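/*
 * Illustrative semantics (sketch only): for a result vector of length N,
 *
 *    res[i] = base_ptr[indexes[i]]      for i = 0 .. N-1
 *
 * i.e. each lane loads an independently addressed scalar.
 */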
466
467
468/**
469 * Read the current value of the ADDR register, convert the floats to
470 * ints, multiply by four and return the vector of offsets.
471 * The offsets will be used to index into the constant buffer or
472 * temporary register file.
473 */
474static LLVMValueRef
475get_indirect_offsets(struct lp_build_tgsi_soa_context *bld,
476                     const struct tgsi_src_register *indirect_reg)
477{
478   /* always use X component of address register */
479   unsigned swizzle = indirect_reg->SwizzleX;
480   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->base.type);
481   LLVMValueRef vec4 = lp_build_const_int_vec(bld->int_bld.type, 4);
482   LLVMValueRef addr_vec;
483
484   addr_vec = LLVMBuildLoad(bld->base.builder,
485                            bld->addr[indirect_reg->Index][swizzle],
486                            "load addr reg");
487
488   /* for indexing we want integers */
489   addr_vec = LLVMBuildFPToSI(bld->base.builder, addr_vec,
490                              int_vec_type, "");
491
492   /* addr_vec = addr_vec * 4 */
493   addr_vec = lp_build_mul(&bld->int_bld, addr_vec, vec4);
494
495   return addr_vec;
496}
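/*
 * Worked example (illustrative): if every lane of the address register's
 * X component holds 3.0, the returned offset vector is (12, 12, 12, ...).
 * Added to the base index (Register.Index * 4 + swizzle) computed in
 * emit_fetch(), this selects channel 'swizzle' of CONST[Register.Index + 3].
 */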
497
498
499/**
500 * Register fetch.
501 */
502static LLVMValueRef
503emit_fetch(
504   struct lp_build_tgsi_soa_context *bld,
505   const struct tgsi_full_instruction *inst,
506   unsigned src_op,
507   const unsigned chan_index )
508{
509   const struct tgsi_full_src_register *reg = &inst->Src[src_op];
510   const unsigned swizzle =
511      tgsi_util_get_full_src_register_swizzle(reg, chan_index);
512   LLVMValueRef res;
513   LLVMValueRef addr_vec = NULL;
514
515   if (swizzle > 3) {
516      assert(0 && "invalid swizzle in emit_fetch()");
517      return bld->base.undef;
518   }
519
520   if (reg->Register.Indirect) {
521      assert(bld->indirect_files);
522      addr_vec = get_indirect_offsets(bld, &reg->Indirect);
523   }
524
525   switch (reg->Register.File) {
526   case TGSI_FILE_CONSTANT:
527      if (reg->Register.Indirect) {
528         LLVMValueRef index_vec;  /* index into the const buffer */
529
530         assert(bld->indirect_files & (1 << TGSI_FILE_CONSTANT));
531
532         /* index_vec = broadcast(reg->Register.Index * 4 + swizzle) */
533         index_vec = lp_build_const_int_vec(bld->int_bld.type,
534                                            reg->Register.Index * 4 + swizzle);
535
536         /* index_vec = index_vec + addr_vec */
537         index_vec = lp_build_add(&bld->int_bld, index_vec, addr_vec);
538
539         /* Gather values from the constant buffer */
540         res = build_gather(bld, bld->consts_ptr, index_vec);
541      }
542      else {
543         LLVMValueRef index;  /* index into the const buffer */
544         LLVMValueRef scalar, scalar_ptr;
545
546         index = lp_build_const_int32(reg->Register.Index*4 + swizzle);
547
548         scalar_ptr = LLVMBuildGEP(bld->base.builder, bld->consts_ptr,
549                                   &index, 1, "");
550         scalar = LLVMBuildLoad(bld->base.builder, scalar_ptr, "");
551
552         res = lp_build_broadcast_scalar(&bld->base, scalar);
553      }
554      break;
555
556   case TGSI_FILE_IMMEDIATE:
557      res = bld->immediates[reg->Register.Index][swizzle];
558      assert(res);
559      break;
560
561   case TGSI_FILE_INPUT:
562      res = bld->inputs[reg->Register.Index][swizzle];
563      assert(res);
564      break;
565
566   case TGSI_FILE_TEMPORARY:
567      if (reg->Register.Indirect) {
568         LLVMValueRef vec_len =
569            lp_build_const_int_vec(bld->int_bld.type, bld->base.type.length);
570         LLVMValueRef index_vec;  /* index into the temp register array */
571         LLVMValueRef temps_array;
572         LLVMTypeRef float4_ptr_type;
573
574         assert(bld->indirect_files & (1 << TGSI_FILE_TEMPORARY));
575
576         /* index_vec = broadcast(reg->Register.Index * 4 + swizzle) */
577         index_vec = lp_build_const_int_vec(bld->int_bld.type,
578                                            reg->Register.Index * 4 + swizzle);
579
580         /* index_vec += addr_vec */
581         index_vec = lp_build_add(&bld->int_bld, index_vec, addr_vec);
582
583         /* index_vec *= vector_length */
584         index_vec = lp_build_mul(&bld->int_bld, index_vec, vec_len);
585
586         /* cast temps_array pointer to float* */
587         float4_ptr_type = LLVMPointerType(LLVMFloatType(), 0);
588         temps_array = LLVMBuildBitCast(bld->int_bld.builder, bld->temps_array,
589                                        float4_ptr_type, "");
590
591         /* Gather values from the temporary register array */
592         res = build_gather(bld, temps_array, index_vec);
593      }
594      else {
595         LLVMValueRef temp_ptr;
596         temp_ptr = get_temp_ptr(bld, reg->Register.Index, swizzle);
597         res = LLVMBuildLoad(bld->base.builder, temp_ptr, "");
598         if (!res)
599            return bld->base.undef;
600      }
601      break;
602
603   default:
604      assert(0 && "invalid src register in emit_fetch()");
605      return bld->base.undef;
606   }
607
608   switch( tgsi_util_get_full_src_register_sign_mode( reg, chan_index ) ) {
609   case TGSI_UTIL_SIGN_CLEAR:
610      res = lp_build_abs( &bld->base, res );
611      break;
612
613   case TGSI_UTIL_SIGN_SET:
614      res = lp_build_abs( &bld->base, res );
615      /* fall through */
616   case TGSI_UTIL_SIGN_TOGGLE:
617      res = lp_build_negate( &bld->base, res );
618      break;
619
620   case TGSI_UTIL_SIGN_KEEP:
621      break;
622   }
623
624   return res;
625}
626
627
628/**
629 * Register fetch with derivatives.
630 */
631static void
632emit_fetch_deriv(
633   struct lp_build_tgsi_soa_context *bld,
634   const struct tgsi_full_instruction *inst,
635   unsigned index,
636   const unsigned chan_index,
637   LLVMValueRef *res,
638   LLVMValueRef *ddx,
639   LLVMValueRef *ddy)
640{
641   LLVMValueRef src;
642
643   src = emit_fetch(bld, inst, index, chan_index);
644
645   if(res)
646      *res = src;
647
648   /* TODO: use interpolation coeffs for inputs */
649
650   if(ddx)
651      *ddx = lp_build_ddx(&bld->base, src);
652
653   if(ddy)
654      *ddy = lp_build_ddy(&bld->base, src);
655}
656
657
658/**
659 * Predicate.
660 */
661static void
662emit_fetch_predicate(
663   struct lp_build_tgsi_soa_context *bld,
664   const struct tgsi_full_instruction *inst,
665   LLVMValueRef *pred)
666{
667   unsigned index;
668   unsigned char swizzles[4];
669   LLVMValueRef unswizzled[4] = {NULL, NULL, NULL, NULL};
670   LLVMValueRef value;
671   unsigned chan;
672
673   if (!inst->Instruction.Predicate) {
674      FOR_EACH_CHANNEL( chan ) {
675         pred[chan] = NULL;
676      }
677      return;
678   }
679
680   swizzles[0] = inst->Predicate.SwizzleX;
681   swizzles[1] = inst->Predicate.SwizzleY;
682   swizzles[2] = inst->Predicate.SwizzleZ;
683   swizzles[3] = inst->Predicate.SwizzleW;
684
685   index = inst->Predicate.Index;
686   assert(index < LP_MAX_TGSI_PREDS);
687
688   FOR_EACH_CHANNEL( chan ) {
689      unsigned swizzle = swizzles[chan];
690
691      /*
692       * Only fetch the predicate register channels that are actually listed
693       * in the swizzles
694       */
695      if (!unswizzled[swizzle]) {
696         value = LLVMBuildLoad(bld->base.builder,
697                               bld->preds[index][swizzle], "");
698
699         /*
700          * Convert the value to an integer mask.
701          *
702          * TODO: Short-circuit this comparison -- a D3D setp_xx instruction
703          * needlessly causes two comparisons due to storing the intermediate
704          * result as a float vector instead of an integer mask vector.
705          */
706         value = lp_build_compare(bld->base.builder,
707                                  bld->base.type,
708                                  PIPE_FUNC_NOTEQUAL,
709                                  value,
710                                  bld->base.zero);
711         if (inst->Predicate.Negate) {
712            value = LLVMBuildNot(bld->base.builder, value, "");
713         }
714
715         unswizzled[swizzle] = value;
716      } else {
717         value = unswizzled[swizzle];
718      }
719
720      pred[chan] = value;
721   }
722}
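/*
 * Illustrative sketch: each fetched predicate channel ends up as an
 * integer mask,
 *
 *    pred[chan] = (PRED[index].swizzle[chan] != 0.0) ? ~0 : 0;
 *    if (Negate) pred[chan] = ~pred[chan];
 *
 * which is later combined with the execution mask in lp_exec_mask_store().
 */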
723
724
725/**
726 * Register store.
727 */
728static void
729emit_store(
730   struct lp_build_tgsi_soa_context *bld,
731   const struct tgsi_full_instruction *inst,
732   unsigned index,
733   unsigned chan_index,
734   LLVMValueRef pred,
735   LLVMValueRef value)
736{
737   const struct tgsi_full_dst_register *reg = &inst->Dst[index];
738   LLVMValueRef addr = NULL;
739
740   switch( inst->Instruction.Saturate ) {
741   case TGSI_SAT_NONE:
742      break;
743
744   case TGSI_SAT_ZERO_ONE:
745      value = lp_build_max(&bld->base, value, bld->base.zero);
746      value = lp_build_min(&bld->base, value, bld->base.one);
747      break;
748
749   case TGSI_SAT_MINUS_PLUS_ONE:
750      value = lp_build_max(&bld->base, value, lp_build_const_vec(bld->base.type, -1.0));
751      value = lp_build_min(&bld->base, value, bld->base.one);
752      break;
753
754   default:
755      assert(0);
756   }
757
758   if (reg->Register.Indirect) {
759      /* XXX use get_indirect_offsets() here eventually */
760      LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->base.type);
761      unsigned swizzle = tgsi_util_get_src_register_swizzle( &reg->Indirect, chan_index );
762
763      assert(bld->indirect_files);
764
765      addr = LLVMBuildLoad(bld->base.builder,
766                           bld->addr[reg->Indirect.Index][swizzle],
767                           "");
768      /* for indexing we want integers */
769      addr = LLVMBuildFPToSI(bld->base.builder, addr,
770                             int_vec_type, "");
771      addr = LLVMBuildExtractElement(bld->base.builder,
772                                     addr, LLVMConstInt(LLVMInt32Type(), 0, 0),
773                                     "");
774      addr = LLVMBuildMul(bld->base.builder,
775                          addr, LLVMConstInt(LLVMInt32Type(), 4, 0),
776                          "");
777   }
778
779   switch( reg->Register.File ) {
780   case TGSI_FILE_OUTPUT:
781      lp_exec_mask_store(&bld->exec_mask, pred, value,
782                         bld->outputs[reg->Register.Index][chan_index]);
783      break;
784
785   case TGSI_FILE_TEMPORARY:
786      if (reg->Register.Indirect) {
787         /* XXX not done yet */
788         debug_printf("WARNING: LLVM scatter store of temp regs"
789                      " not implemented\n");
790      }
791      else {
792         LLVMValueRef temp_ptr = get_temp_ptr(bld, reg->Register.Index,
793                                              chan_index);
794         lp_exec_mask_store(&bld->exec_mask, pred, value, temp_ptr);
795      }
796      break;
797
798   case TGSI_FILE_ADDRESS:
799      lp_exec_mask_store(&bld->exec_mask, pred, value,
800                         bld->addr[reg->Register.Index][chan_index]);
801      break;
802
803   case TGSI_FILE_PREDICATE:
804      lp_exec_mask_store(&bld->exec_mask, pred, value,
805                         bld->preds[reg->Register.Index][chan_index]);
806      break;
807
808   default:
809      assert( 0 );
810   }
811}
812
813
814/**
815 * High-level instruction translators.
816 */
817
818static void
819emit_tex( struct lp_build_tgsi_soa_context *bld,
820          const struct tgsi_full_instruction *inst,
821          enum lp_build_tex_modifier modifier,
822          LLVMValueRef *texel)
823{
824   unsigned unit;
825   LLVMValueRef lod_bias, explicit_lod;
826   LLVMValueRef oow = NULL;
827   LLVMValueRef coords[3];
828   LLVMValueRef ddx[3];
829   LLVMValueRef ddy[3];
830   unsigned num_coords;
831   unsigned i;
832
833   if (!bld->sampler) {
834      _debug_printf("warning: found texture instruction but no sampler generator supplied\n");
835      for (i = 0; i < 4; i++) {
836         texel[i] = bld->base.undef;
837      }
838      return;
839   }
840
841   switch (inst->Texture.Texture) {
842   case TGSI_TEXTURE_1D:
843      num_coords = 1;
844      break;
845   case TGSI_TEXTURE_2D:
846   case TGSI_TEXTURE_RECT:
847      num_coords = 2;
848      break;
849   case TGSI_TEXTURE_SHADOW1D:
850   case TGSI_TEXTURE_SHADOW2D:
851   case TGSI_TEXTURE_SHADOWRECT:
852   case TGSI_TEXTURE_3D:
853   case TGSI_TEXTURE_CUBE:
854      num_coords = 3;
855      break;
856   default:
857      assert(0);
858      return;
859   }
860
861   if (modifier == LP_BLD_TEX_MODIFIER_LOD_BIAS) {
862      lod_bias = emit_fetch( bld, inst, 0, 3 );
863      explicit_lod = NULL;
864   }
865   else if (modifier == LP_BLD_TEX_MODIFIER_EXPLICIT_LOD) {
866      lod_bias = NULL;
867      explicit_lod = emit_fetch( bld, inst, 0, 3 );
868   }
869   else {
870      lod_bias = NULL;
871      explicit_lod = NULL;
872   }
873
874   if (modifier == LP_BLD_TEX_MODIFIER_PROJECTED) {
875      oow = emit_fetch( bld, inst, 0, 3 );
876      oow = lp_build_rcp(&bld->base, oow);
877   }
878
879   for (i = 0; i < num_coords; i++) {
880      coords[i] = emit_fetch( bld, inst, 0, i );
881      if (modifier == LP_BLD_TEX_MODIFIER_PROJECTED)
882         coords[i] = lp_build_mul(&bld->base, coords[i], oow);
883   }
884   for (i = num_coords; i < 3; i++) {
885      coords[i] = bld->base.undef;
886   }
887
888   if (modifier == LP_BLD_TEX_MODIFIER_EXPLICIT_DERIV) {
889      for (i = 0; i < num_coords; i++) {
890         ddx[i] = emit_fetch( bld, inst, 1, i );
891         ddy[i] = emit_fetch( bld, inst, 2, i );
892      }
893      unit = inst->Src[3].Register.Index;
894   } else {
895      for (i = 0; i < num_coords; i++) {
896         ddx[i] = lp_build_ddx( &bld->base, coords[i] );
897         ddy[i] = lp_build_ddy( &bld->base, coords[i] );
898      }
899      unit = inst->Src[1].Register.Index;
900   }
901   for (i = num_coords; i < 3; i++) {
902      ddx[i] = bld->base.undef;
903      ddy[i] = bld->base.undef;
904   }
905
906   bld->sampler->emit_fetch_texel(bld->sampler,
907                                  bld->base.builder,
908                                  bld->base.type,
909                                  unit, num_coords, coords,
910                                  ddx, ddy,
911                                  lod_bias, explicit_lod,
912                                  texel);
913}
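/*
 * Example (illustrative): for a projected lookup such as TXP, the code
 * above fetches w into 'oow', takes its reciprocal, and scales every
 * coordinate, i.e. coords[i] = src.coord[i] * (1 / src.w), before
 * handing the coordinates and derivatives to the sampler generator.
 */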
914
915
916/**
917 * Kill fragment if any of the src register values are negative.
918 */
919static void
920emit_kil(
921   struct lp_build_tgsi_soa_context *bld,
922   const struct tgsi_full_instruction *inst )
923{
924   const struct tgsi_full_src_register *reg = &inst->Src[0];
925   LLVMValueRef terms[NUM_CHANNELS];
926   LLVMValueRef mask;
927   unsigned chan_index;
928
929   memset(&terms, 0, sizeof terms);
930
931   FOR_EACH_CHANNEL( chan_index ) {
932      unsigned swizzle;
933
934      /* Unswizzle channel */
935      swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index );
936
937      /* Check if the component has not been already tested. */
938      assert(swizzle < NUM_CHANNELS);
939      if( !terms[swizzle] )
940         /* TODO: change the comparison operator instead of setting the sign */
941         terms[swizzle] =  emit_fetch(bld, inst, 0, chan_index );
942   }
943
944   mask = NULL;
945   FOR_EACH_CHANNEL( chan_index ) {
946      if(terms[chan_index]) {
947         LLVMValueRef chan_mask;
948
949         /*
950          * If term < 0 then mask = 0 else mask = ~0.
951          */
952         chan_mask = lp_build_cmp(&bld->base, PIPE_FUNC_GEQUAL, terms[chan_index], bld->base.zero);
953
954         if(mask)
955            mask = LLVMBuildAnd(bld->base.builder, mask, chan_mask, "");
956         else
957            mask = chan_mask;
958      }
959   }
960
961   if(mask)
962      lp_build_mask_update(bld->mask, mask);
963}
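/*
 * Illustrative sketch: for "KIL SRC.xyzw" each unique swizzled channel
 * contributes (term >= 0) to a running AND, so the fragment mask keeps
 * only lanes where every tested component is non-negative.
 */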
964
965
966/**
967 * Predicated fragment kill.
968 * XXX Actually, we do an unconditional kill (as in tgsi_exec.c).
969 * The only predication is the execution mask which will apply if
970 * we're inside a loop or conditional.
971 */
972static void
973emit_kilp(struct lp_build_tgsi_soa_context *bld,
974          const struct tgsi_full_instruction *inst)
975{
976   LLVMValueRef mask;
977
978   /* For those channels which are "alive", disable fragment shader
979    * execution.
980    */
981   if (bld->exec_mask.has_mask) {
982      mask = LLVMBuildNot(bld->base.builder, bld->exec_mask.exec_mask, "kilp");
983   }
984   else {
985      mask = bld->base.zero;
986   }
987
988   lp_build_mask_update(bld->mask, mask);
989}
990
991static void
992emit_declaration(
993   struct lp_build_tgsi_soa_context *bld,
994   const struct tgsi_full_declaration *decl)
995{
996   LLVMTypeRef vec_type = lp_build_vec_type(bld->base.type);
997
998   unsigned first = decl->Range.First;
999   unsigned last = decl->Range.Last;
1000   unsigned idx, i;
1001
1002   for (idx = first; idx <= last; ++idx) {
1003      switch (decl->Declaration.File) {
1004      case TGSI_FILE_TEMPORARY:
1005         assert(idx < LP_MAX_TGSI_TEMPS);
1006         if (bld->indirect_files & (1 << TGSI_FILE_TEMPORARY)) {
1007            LLVMValueRef array_size = LLVMConstInt(LLVMInt32Type(),
1008                                                   last*4 + 4, 0);
1009            bld->temps_array = lp_build_array_alloca(bld->base.builder,
1010                                                     vec_type, array_size, "");
1011         } else {
1012            for (i = 0; i < NUM_CHANNELS; i++)
1013               bld->temps[idx][i] = lp_build_alloca(bld->base.builder,
1014                                                    vec_type, "");
1015         }
1016         break;
1017
1018      case TGSI_FILE_OUTPUT:
1019         for (i = 0; i < NUM_CHANNELS; i++)
1020            bld->outputs[idx][i] = lp_build_alloca(bld->base.builder,
1021                                                   vec_type, "");
1022         break;
1023
1024      case TGSI_FILE_ADDRESS:
1025         assert(idx < LP_MAX_TGSI_ADDRS);
1026         for (i = 0; i < NUM_CHANNELS; i++)
1027            bld->addr[idx][i] = lp_build_alloca(bld->base.builder,
1028                                                vec_type, "");
1029         break;
1030
1031      case TGSI_FILE_PREDICATE:
1032         assert(idx < LP_MAX_TGSI_PREDS);
1033         for (i = 0; i < NUM_CHANNELS; i++)
1034            bld->preds[idx][i] = lp_build_alloca(bld->base.builder,
1035                                                 vec_type, "");
1036         break;
1037
1038      default:
1039         /* don't need to declare other vars */
1040         break;
1041      }
1042   }
1043}
1044
1045
1046/**
1047 * Emit LLVM for one TGSI instruction.
1048 * \return TRUE for success, FALSE otherwise
1049 */
1050static boolean
1051emit_instruction(
1052   struct lp_build_tgsi_soa_context *bld,
1053   const struct tgsi_full_instruction *inst,
1054   const struct tgsi_opcode_info *info,
1055   int *pc)
1056{
1057   unsigned chan_index;
1058   LLVMValueRef src0, src1, src2;
1059   LLVMValueRef tmp0, tmp1, tmp2;
1060   LLVMValueRef tmp3 = NULL;
1061   LLVMValueRef tmp4 = NULL;
1062   LLVMValueRef tmp5 = NULL;
1063   LLVMValueRef tmp6 = NULL;
1064   LLVMValueRef tmp7 = NULL;
1065   LLVMValueRef res;
1066   LLVMValueRef dst0[NUM_CHANNELS];
1067
1068   /*
1069    * Stores and write masks are handled in a general fashion after the long
1070    * instruction opcode switch statement.
1071    *
1072    * Although not strictly necessary, we avoid generating instructions for
1073    * channels which won't be stored, in cases where that's easy. For some
1074    * complex instructions, like texture sampling, it is more convenient to
1075    * assume a full writemask and then let LLVM optimization passes eliminate
1076    * redundant code.
1077    */
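   /*
    * Example (illustrative): for "MUL TEMP[0].xz, IN[0], IN[1]" the
    * FOR_EACH_DST0_ENABLED_CHANNEL loops below only visit CHAN_X and
    * CHAN_Z; dst0[CHAN_Y] and dst0[CHAN_W] stay undefined and are never
    * written back by the store pass that follows the switch statement.
    */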
1078
1079   (*pc)++;
1080
1081   assert(info->num_dst <= 1);
1082   if (info->num_dst) {
1083      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1084         dst0[chan_index] = bld->base.undef;
1085      }
1086   }
1087
1088   switch (inst->Instruction.Opcode) {
1089   case TGSI_OPCODE_ARL:
1090      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1091         tmp0 = emit_fetch( bld, inst, 0, chan_index );
1092         tmp0 = lp_build_floor(&bld->base, tmp0);
1093         dst0[chan_index] = tmp0;
1094      }
1095      break;
1096
1097   case TGSI_OPCODE_MOV:
1098      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1099         dst0[chan_index] = emit_fetch( bld, inst, 0, chan_index );
1100      }
1101      break;
1102
1103   case TGSI_OPCODE_LIT:
1104      if( IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ) ) {
1105         dst0[CHAN_X] = bld->base.one;
1106      }
1107      if( IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y ) ) {
1108         src0 = emit_fetch( bld, inst, 0, CHAN_X );
1109         dst0[CHAN_Y] = lp_build_max( &bld->base, src0, bld->base.zero);
1110      }
1111      if( IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z ) ) {
1112         /* XMM[1] = SrcReg[0].yyyy */
1113         tmp1 = emit_fetch( bld, inst, 0, CHAN_Y );
1114         /* XMM[1] = max(XMM[1], 0) */
1115         tmp1 = lp_build_max( &bld->base, tmp1, bld->base.zero);
1116         /* XMM[2] = SrcReg[0].wwww */
1117         tmp2 = emit_fetch( bld, inst, 0, CHAN_W );
1118         tmp1 = lp_build_pow( &bld->base, tmp1, tmp2);
1119         tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
1120         tmp2 = lp_build_cmp(&bld->base, PIPE_FUNC_GREATER, tmp0, bld->base.zero);
1121         dst0[CHAN_Z] = lp_build_select(&bld->base, tmp2, tmp1, bld->base.zero);
1122      }
1123      if( IS_DST0_CHANNEL_ENABLED( inst, CHAN_W ) ) {
1124         dst0[CHAN_W] = bld->base.one;
1125      }
1126      break;
1127
1128   case TGSI_OPCODE_RCP:
1129   /* TGSI_OPCODE_RECIP */
1130      src0 = emit_fetch( bld, inst, 0, CHAN_X );
1131      res = lp_build_rcp(&bld->base, src0);
1132      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1133         dst0[chan_index] = res;
1134      }
1135      break;
1136
1137   case TGSI_OPCODE_RSQ:
1138   /* TGSI_OPCODE_RECIPSQRT */
1139      src0 = emit_fetch( bld, inst, 0, CHAN_X );
1140      src0 = lp_build_abs(&bld->base, src0);
1141      res = lp_build_rsqrt(&bld->base, src0);
1142      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1143         dst0[chan_index] = res;
1144      }
1145      break;
1146
1147   case TGSI_OPCODE_EXP:
1148      if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ) ||
1149          IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y ) ||
1150          IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z )) {
1151         LLVMValueRef *p_exp2_int_part = NULL;
1152         LLVMValueRef *p_frac_part = NULL;
1153         LLVMValueRef *p_exp2 = NULL;
1154
1155         src0 = emit_fetch( bld, inst, 0, CHAN_X );
1156
1157         if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ))
1158            p_exp2_int_part = &tmp0;
1159         if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y ))
1160            p_frac_part = &tmp1;
1161         if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z ))
1162            p_exp2 = &tmp2;
1163
1164         lp_build_exp2_approx(&bld->base, src0, p_exp2_int_part, p_frac_part, p_exp2);
1165
1166         if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ))
1167            dst0[CHAN_X] = tmp0;
1168         if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y ))
1169            dst0[CHAN_Y] = tmp1;
1170         if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z ))
1171            dst0[CHAN_Z] = tmp2;
1172      }
1173      /* dst.w = 1.0 */
1174      if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_W )) {
1175         dst0[CHAN_W] = bld->base.one;
1176      }
1177      break;
1178
1179   case TGSI_OPCODE_LOG:
1180      if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ) ||
1181          IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y ) ||
1182          IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z )) {
1183         LLVMValueRef *p_floor_log2 = NULL;
1184         LLVMValueRef *p_exp = NULL;
1185         LLVMValueRef *p_log2 = NULL;
1186
1187         src0 = emit_fetch( bld, inst, 0, CHAN_X );
1188         src0 = lp_build_abs( &bld->base, src0 );
1189
1190         if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ))
1191            p_floor_log2 = &tmp0;
1192         if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y ))
1193            p_exp = &tmp1;
1194         if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z ))
1195            p_log2 = &tmp2;
1196
1197         lp_build_log2_approx(&bld->base, src0, p_exp, p_floor_log2, p_log2);
1198
1199         /* dst.x = floor(lg2(abs(src.x))) */
1200         if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ))
1201            dst0[CHAN_X] = tmp0;
1202         /* dst.y = abs(src.x)/ex2(floor(lg2(abs(src.x)))) */
1203         if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y )) {
1204            dst0[CHAN_Y] = lp_build_div( &bld->base, src0, tmp1);
1205         }
1206         /* dst.z = lg2(abs(src.x)) */
1207         if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z ))
1208            dst0[CHAN_Z] = tmp2;
1209      }
1210      /* dst.w = 1.0 */
1211      if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_W )) {
1212         dst0[CHAN_W] = bld->base.one;
1213      }
1214      break;
1215
1216   case TGSI_OPCODE_MUL:
1217      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1218         src0 = emit_fetch( bld, inst, 0, chan_index );
1219         src1 = emit_fetch( bld, inst, 1, chan_index );
1220         dst0[chan_index] = lp_build_mul(&bld->base, src0, src1);
1221      }
1222      break;
1223
1224   case TGSI_OPCODE_ADD:
1225      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1226         src0 = emit_fetch( bld, inst, 0, chan_index );
1227         src1 = emit_fetch( bld, inst, 1, chan_index );
1228         dst0[chan_index] = lp_build_add(&bld->base, src0, src1);
1229      }
1230      break;
1231
1232   case TGSI_OPCODE_DP3:
1233   /* TGSI_OPCODE_DOT3 */
1234      tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
1235      tmp1 = emit_fetch( bld, inst, 1, CHAN_X );
1236      tmp0 = lp_build_mul( &bld->base, tmp0, tmp1);
1237      tmp1 = emit_fetch( bld, inst, 0, CHAN_Y );
1238      tmp2 = emit_fetch( bld, inst, 1, CHAN_Y );
1239      tmp1 = lp_build_mul( &bld->base, tmp1, tmp2);
1240      tmp0 = lp_build_add( &bld->base, tmp0, tmp1);
1241      tmp1 = emit_fetch( bld, inst, 0, CHAN_Z );
1242      tmp2 = emit_fetch( bld, inst, 1, CHAN_Z );
1243      tmp1 = lp_build_mul( &bld->base, tmp1, tmp2);
1244      tmp0 = lp_build_add( &bld->base, tmp0, tmp1);
1245      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1246         dst0[chan_index] = tmp0;
1247      }
1248      break;
1249
1250   case TGSI_OPCODE_DP4:
1251   /* TGSI_OPCODE_DOT4 */
1252      tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
1253      tmp1 = emit_fetch( bld, inst, 1, CHAN_X );
1254      tmp0 = lp_build_mul( &bld->base, tmp0, tmp1);
1255      tmp1 = emit_fetch( bld, inst, 0, CHAN_Y );
1256      tmp2 = emit_fetch( bld, inst, 1, CHAN_Y );
1257      tmp1 = lp_build_mul( &bld->base, tmp1, tmp2);
1258      tmp0 = lp_build_add( &bld->base, tmp0, tmp1);
1259      tmp1 = emit_fetch( bld, inst, 0, CHAN_Z );
1260      tmp2 = emit_fetch( bld, inst, 1, CHAN_Z );
1261      tmp1 = lp_build_mul( &bld->base, tmp1, tmp2);
1262      tmp0 = lp_build_add( &bld->base, tmp0, tmp1);
1263      tmp1 = emit_fetch( bld, inst, 0, CHAN_W );
1264      tmp2 = emit_fetch( bld, inst, 1, CHAN_W );
1265      tmp1 = lp_build_mul( &bld->base, tmp1, tmp2);
1266      tmp0 = lp_build_add( &bld->base, tmp0, tmp1);
1267      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1268         dst0[chan_index] = tmp0;
1269      }
1270      break;
1271
1272   case TGSI_OPCODE_DST:
1273      IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ) {
1274         dst0[CHAN_X] = bld->base.one;
1275      }
1276      IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y ) {
1277         tmp0 = emit_fetch( bld, inst, 0, CHAN_Y );
1278         tmp1 = emit_fetch( bld, inst, 1, CHAN_Y );
1279         dst0[CHAN_Y] = lp_build_mul( &bld->base, tmp0, tmp1);
1280      }
1281      IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z ) {
1282         dst0[CHAN_Z] = emit_fetch( bld, inst, 0, CHAN_Z );
1283      }
1284      IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_W ) {
1285         dst0[CHAN_W] = emit_fetch( bld, inst, 1, CHAN_W );
1286      }
1287      break;
1288
1289   case TGSI_OPCODE_MIN:
1290      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1291         src0 = emit_fetch( bld, inst, 0, chan_index );
1292         src1 = emit_fetch( bld, inst, 1, chan_index );
1293         dst0[chan_index] = lp_build_min( &bld->base, src0, src1 );
1294      }
1295      break;
1296
1297   case TGSI_OPCODE_MAX:
1298      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1299         src0 = emit_fetch( bld, inst, 0, chan_index );
1300         src1 = emit_fetch( bld, inst, 1, chan_index );
1301         dst0[chan_index] = lp_build_max( &bld->base, src0, src1 );
1302      }
1303      break;
1304
1305   case TGSI_OPCODE_SLT:
1306   /* TGSI_OPCODE_SETLT */
1307      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1308         src0 = emit_fetch( bld, inst, 0, chan_index );
1309         src1 = emit_fetch( bld, inst, 1, chan_index );
1310         tmp0 = lp_build_cmp( &bld->base, PIPE_FUNC_LESS, src0, src1 );
1311         dst0[chan_index] = lp_build_select( &bld->base, tmp0, bld->base.one, bld->base.zero );
1312      }
1313      break;
1314
1315   case TGSI_OPCODE_SGE:
1316   /* TGSI_OPCODE_SETGE */
1317      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1318         src0 = emit_fetch( bld, inst, 0, chan_index );
1319         src1 = emit_fetch( bld, inst, 1, chan_index );
1320         tmp0 = lp_build_cmp( &bld->base, PIPE_FUNC_GEQUAL, src0, src1 );
1321         dst0[chan_index] = lp_build_select( &bld->base, tmp0, bld->base.one, bld->base.zero );
1322      }
1323      break;
1324
1325   case TGSI_OPCODE_MAD:
1326   /* TGSI_OPCODE_MADD */
1327      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1328         tmp0 = emit_fetch( bld, inst, 0, chan_index );
1329         tmp1 = emit_fetch( bld, inst, 1, chan_index );
1330         tmp2 = emit_fetch( bld, inst, 2, chan_index );
1331         tmp0 = lp_build_mul( &bld->base, tmp0, tmp1);
1332         tmp0 = lp_build_add( &bld->base, tmp0, tmp2);
1333         dst0[chan_index] = tmp0;
1334      }
1335      break;
1336
1337   case TGSI_OPCODE_SUB:
1338      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1339         tmp0 = emit_fetch( bld, inst, 0, chan_index );
1340         tmp1 = emit_fetch( bld, inst, 1, chan_index );
1341         dst0[chan_index] = lp_build_sub( &bld->base, tmp0, tmp1);
1342      }
1343      break;
1344
1345   case TGSI_OPCODE_LRP:
1346      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1347         src0 = emit_fetch( bld, inst, 0, chan_index );
1348         src1 = emit_fetch( bld, inst, 1, chan_index );
1349         src2 = emit_fetch( bld, inst, 2, chan_index );
1350         tmp0 = lp_build_sub( &bld->base, src1, src2 );
1351         tmp0 = lp_build_mul( &bld->base, src0, tmp0 );
1352         dst0[chan_index] = lp_build_add( &bld->base, tmp0, src2 );
1353      }
1354      break;
1355
1356   case TGSI_OPCODE_CND:
1357      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1358         src0 = emit_fetch( bld, inst, 0, chan_index );
1359         src1 = emit_fetch( bld, inst, 1, chan_index );
1360         src2 = emit_fetch( bld, inst, 2, chan_index );
1361         tmp1 = lp_build_const_vec(bld->base.type, 0.5);
1362         tmp0 = lp_build_cmp( &bld->base, PIPE_FUNC_GREATER, src2, tmp1);
1363         dst0[chan_index] = lp_build_select( &bld->base, tmp0, src0, src1 );
1364      }
1365      break;
1366
1367   case TGSI_OPCODE_DP2A:
1368      tmp0 = emit_fetch( bld, inst, 0, CHAN_X );  /* xmm0 = src[0].x */
1369      tmp1 = emit_fetch( bld, inst, 1, CHAN_X );  /* xmm1 = src[1].x */
1370      tmp0 = lp_build_mul( &bld->base, tmp0, tmp1);              /* xmm0 = xmm0 * xmm1 */
1371      tmp1 = emit_fetch( bld, inst, 0, CHAN_Y );  /* xmm1 = src[0].y */
1372      tmp2 = emit_fetch( bld, inst, 1, CHAN_Y );  /* xmm2 = src[1].y */
1373      tmp1 = lp_build_mul( &bld->base, tmp1, tmp2);              /* xmm1 = xmm1 * xmm2 */
1374      tmp0 = lp_build_add( &bld->base, tmp0, tmp1);              /* xmm0 = xmm0 + xmm1 */
1375      tmp1 = emit_fetch( bld, inst, 2, CHAN_X );  /* xmm1 = src[2].x */
1376      tmp0 = lp_build_add( &bld->base, tmp0, tmp1);              /* xmm0 = xmm0 + xmm1 */
1377      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1378         dst0[chan_index] = tmp0;  /* dest[ch] = xmm0 */
1379      }
1380      break;
1381
1382   case TGSI_OPCODE_FRC:
1383      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1384         src0 = emit_fetch( bld, inst, 0, chan_index );
1385         tmp0 = lp_build_floor(&bld->base, src0);
1386         tmp0 = lp_build_sub(&bld->base, src0, tmp0);
1387         dst0[chan_index] = tmp0;
1388      }
1389      break;
1390
1391   case TGSI_OPCODE_CLAMP:
1392      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1393         tmp0 = emit_fetch( bld, inst, 0, chan_index );
1394         src1 = emit_fetch( bld, inst, 1, chan_index );
1395         src2 = emit_fetch( bld, inst, 2, chan_index );
1396         tmp0 = lp_build_max(&bld->base, tmp0, src1);
1397         tmp0 = lp_build_min(&bld->base, tmp0, src2);
1398         dst0[chan_index] = tmp0;
1399      }
1400      break;
1401
1402   case TGSI_OPCODE_FLR:
1403      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1404         tmp0 = emit_fetch( bld, inst, 0, chan_index );
1405         dst0[chan_index] = lp_build_floor(&bld->base, tmp0);
1406      }
1407      break;
1408
1409   case TGSI_OPCODE_ROUND:
1410      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1411         tmp0 = emit_fetch( bld, inst, 0, chan_index );
1412         dst0[chan_index] = lp_build_round(&bld->base, tmp0);
1413      }
1414      break;
1415
1416   case TGSI_OPCODE_EX2: {
1417      tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
1418      tmp0 = lp_build_exp2( &bld->base, tmp0);
1419      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1420         dst0[chan_index] = tmp0;
1421      }
1422      break;
1423   }
1424
1425   case TGSI_OPCODE_LG2:
1426      tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
1427      tmp0 = lp_build_log2( &bld->base, tmp0);
1428      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1429         dst0[chan_index] = tmp0;
1430      }
1431      break;
1432
1433   case TGSI_OPCODE_POW:
1434      src0 = emit_fetch( bld, inst, 0, CHAN_X );
1435      src1 = emit_fetch( bld, inst, 1, CHAN_X );
1436      res = lp_build_pow( &bld->base, src0, src1 );
1437      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1438         dst0[chan_index] = res;
1439      }
1440      break;
1441
1442   case TGSI_OPCODE_XPD:
1443      if( IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ) ||
1444          IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y ) ) {
1445         tmp1 = emit_fetch( bld, inst, 1, CHAN_Z );
1446         tmp3 = emit_fetch( bld, inst, 0, CHAN_Z );
1447      }
1448      if( IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ) ||
1449          IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z ) ) {
1450         tmp0 = emit_fetch( bld, inst, 0, CHAN_Y );
1451         tmp4 = emit_fetch( bld, inst, 1, CHAN_Y );
1452      }
1453      IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ) {
1454         tmp2 = tmp0;
1455         tmp2 = lp_build_mul( &bld->base, tmp2, tmp1);
1456         tmp5 = tmp3;
1457         tmp5 = lp_build_mul( &bld->base, tmp5, tmp4);
1458         tmp2 = lp_build_sub( &bld->base, tmp2, tmp5);
1459         dst0[CHAN_X] = tmp2;
1460      }
1461      if( IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y ) ||
1462          IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z ) ) {
1463         tmp2 = emit_fetch( bld, inst, 1, CHAN_X );
1464         tmp5 = emit_fetch( bld, inst, 0, CHAN_X );
1465      }
1466      IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y ) {
1467         tmp3 = lp_build_mul( &bld->base, tmp3, tmp2);
1468         tmp1 = lp_build_mul( &bld->base, tmp1, tmp5);
1469         tmp3 = lp_build_sub( &bld->base, tmp3, tmp1);
1470         dst0[CHAN_Y] = tmp3;
1471      }
1472      IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z ) {
1473         tmp5 = lp_build_mul( &bld->base, tmp5, tmp4);
1474         tmp0 = lp_build_mul( &bld->base, tmp0, tmp2);
1475         tmp5 = lp_build_sub( &bld->base, tmp5, tmp0);
1476         dst0[CHAN_Z] = tmp5;
1477      }
1478      IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_W ) {
1479         dst0[CHAN_W] = bld->base.one;
1480      }
1481      break;
1482
1483   case TGSI_OPCODE_ABS:
1484      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1485         tmp0 = emit_fetch( bld, inst, 0, chan_index );
1486         dst0[chan_index] = lp_build_abs( &bld->base, tmp0 );
1487      }
1488      break;
1489
1490   case TGSI_OPCODE_RCC:
1491      /* deprecated? */
1492      assert(0);
1493      return FALSE;
1494
1495   case TGSI_OPCODE_DPH:
1496      tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
1497      tmp1 = emit_fetch( bld, inst, 1, CHAN_X );
1498      tmp0 = lp_build_mul( &bld->base, tmp0, tmp1);
1499      tmp1 = emit_fetch( bld, inst, 0, CHAN_Y );
1500      tmp2 = emit_fetch( bld, inst, 1, CHAN_Y );
1501      tmp1 = lp_build_mul( &bld->base, tmp1, tmp2);
1502      tmp0 = lp_build_add( &bld->base, tmp0, tmp1);
1503      tmp1 = emit_fetch( bld, inst, 0, CHAN_Z );
1504      tmp2 = emit_fetch( bld, inst, 1, CHAN_Z );
1505      tmp1 = lp_build_mul( &bld->base, tmp1, tmp2);
1506      tmp0 = lp_build_add( &bld->base, tmp0, tmp1);
1507      tmp1 = emit_fetch( bld, inst, 1, CHAN_W );
1508      tmp0 = lp_build_add( &bld->base, tmp0, tmp1);
1509      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1510         dst0[chan_index] = tmp0;
1511      }
1512      break;
1513
1514   case TGSI_OPCODE_COS:
1515      tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
1516      tmp0 = lp_build_cos( &bld->base, tmp0 );
1517      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1518         dst0[chan_index] = tmp0;
1519      }
1520      break;
1521
1522   case TGSI_OPCODE_DDX:
1523      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1524         emit_fetch_deriv( bld, inst, 0, chan_index, NULL, &dst0[chan_index], NULL);
1525      }
1526      break;
1527
1528   case TGSI_OPCODE_DDY:
1529      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1530         emit_fetch_deriv( bld, inst, 0, chan_index, NULL, NULL, &dst0[chan_index]);
1531      }
1532      break;
1533
1534   case TGSI_OPCODE_KILP:
1535      /* predicated kill */
1536      emit_kilp( bld, inst );
1537      break;
1538
1539   case TGSI_OPCODE_KIL:
1540      /* conditional kill */
1541      emit_kil( bld, inst );
1542      break;
1543
1544   case TGSI_OPCODE_PK2H:
1545      return FALSE;
1546      break;
1547
1548   case TGSI_OPCODE_PK2US:
1549      return FALSE;
1550      break;
1551
1552   case TGSI_OPCODE_PK4B:
1553      return FALSE;
1554      break;
1555
1556   case TGSI_OPCODE_PK4UB:
1557      return FALSE;
1558      break;
1559
1560   case TGSI_OPCODE_RFL:
1561      return FALSE;
1562      break;
1563
1564   case TGSI_OPCODE_SEQ:
1565      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1566         src0 = emit_fetch( bld, inst, 0, chan_index );
1567         src1 = emit_fetch( bld, inst, 1, chan_index );
1568         tmp0 = lp_build_cmp( &bld->base, PIPE_FUNC_EQUAL, src0, src1 );
1569         dst0[chan_index] = lp_build_select( &bld->base, tmp0, bld->base.one, bld->base.zero );
1570      }
1571      break;
1572
1573   case TGSI_OPCODE_SFL:
1574      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1575         dst0[chan_index] = bld->base.zero;
1576      }
1577      break;
1578
1579   case TGSI_OPCODE_SGT:
1580      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1581         src0 = emit_fetch( bld, inst, 0, chan_index );
1582         src1 = emit_fetch( bld, inst, 1, chan_index );
1583         tmp0 = lp_build_cmp( &bld->base, PIPE_FUNC_GREATER, src0, src1 );
1584         dst0[chan_index] = lp_build_select( &bld->base, tmp0, bld->base.one, bld->base.zero );
1585      }
1586      break;
1587
1588   case TGSI_OPCODE_SIN:
1589      tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
1590      tmp0 = lp_build_sin( &bld->base, tmp0 );
1591      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1592         dst0[chan_index] = tmp0;
1593      }
1594      break;
1595
1596   case TGSI_OPCODE_SLE:
1597      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1598         src0 = emit_fetch( bld, inst, 0, chan_index );
1599         src1 = emit_fetch( bld, inst, 1, chan_index );
1600         tmp0 = lp_build_cmp( &bld->base, PIPE_FUNC_LEQUAL, src0, src1 );
1601         dst0[chan_index] = lp_build_select( &bld->base, tmp0, bld->base.one, bld->base.zero );
1602      }
1603      break;
1604
1605   case TGSI_OPCODE_SNE:
1606      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1607         src0 = emit_fetch( bld, inst, 0, chan_index );
1608         src1 = emit_fetch( bld, inst, 1, chan_index );
1609         tmp0 = lp_build_cmp( &bld->base, PIPE_FUNC_NOTEQUAL, src0, src1 );
1610         dst0[chan_index] = lp_build_select( &bld->base, tmp0, bld->base.one, bld->base.zero );
1611      }
1612      break;
1613
1614   case TGSI_OPCODE_STR:
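          /* set true: dst = 1.0 */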
1615      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1616         dst0[chan_index] = bld->base.one;
1617      }
1618      break;
1619
1620   case TGSI_OPCODE_TEX:
1621      emit_tex( bld, inst, LP_BLD_TEX_MODIFIER_NONE, dst0 );
1622      break;
1623
1624   case TGSI_OPCODE_TXD:
1625      emit_tex( bld, inst, LP_BLD_TEX_MODIFIER_EXPLICIT_DERIV, dst0 );
1626      break;
1627
1628   case TGSI_OPCODE_UP2H:
1629      /* deprecated */
1630      assert(0);
1631      return FALSE;
1632      break;
1633
1634   case TGSI_OPCODE_UP2US:
1635      /* deprecated */
1636      assert(0);
1637      return FALSE;
1638      break;
1639
1640   case TGSI_OPCODE_UP4B:
1641      /* deprecated */
1642      assert(0);
1643      return FALSE;
1644      break;
1645
1646   case TGSI_OPCODE_UP4UB:
1647      /* deprecated */
1648      assert(0);
1649      return FALSE;
1650      break;
1651
1652   case TGSI_OPCODE_X2D:
1653      /* deprecated? */
1654      assert(0);
1655      return FALSE;
1656      break;
1657
1658   case TGSI_OPCODE_ARA:
1659      /* deprecated */
1660      assert(0);
1661      return FALSE;
1662      break;
1663
1664   case TGSI_OPCODE_ARR:
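          /* address register load with round: dst = round(src0) */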
1665      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1666         tmp0 = emit_fetch( bld, inst, 0, chan_index );
1667         tmp0 = lp_build_round(&bld->base, tmp0);
1668         dst0[chan_index] = tmp0;
1669      }
1670      break;
1671
1672   case TGSI_OPCODE_BRA:
1673      /* deprecated */
1674      assert(0);
1675      return FALSE;
1676      break;
1677
1678   case TGSI_OPCODE_CAL:
1679      lp_exec_mask_call(&bld->exec_mask,
1680                        inst->Label.Label,
1681                        pc);
1682
1683      break;
1684
1685   case TGSI_OPCODE_RET:
1686      lp_exec_mask_ret(&bld->exec_mask, pc);
1687      break;
1688
1689   case TGSI_OPCODE_END:
1690      *pc = -1;
1691      break;
1692
1693   case TGSI_OPCODE_SSG:
1694   /* also TGSI_OPCODE_SGN: dst = sign(src0), i.e. -1.0, 0.0 or +1.0 per channel */
1695      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1696         tmp0 = emit_fetch( bld, inst, 0, chan_index );
1697         dst0[chan_index] = lp_build_sgn( &bld->base, tmp0 );
1698      }
1699      break;
1700
1701   case TGSI_OPCODE_CMP:
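          /* compare: dst = (src0 < 0.0) ? src1 : src2, per channel */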
1702      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1703         src0 = emit_fetch( bld, inst, 0, chan_index );
1704         src1 = emit_fetch( bld, inst, 1, chan_index );
1705         src2 = emit_fetch( bld, inst, 2, chan_index );
1706         tmp0 = lp_build_cmp( &bld->base, PIPE_FUNC_LESS, src0, bld->base.zero );
1707         dst0[chan_index] = lp_build_select( &bld->base, tmp0, src1, src2);
1708      }
1709      break;
1710
1711   case TGSI_OPCODE_SCS:
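          /* sine/cosine: dst = (cos(src0.x), sin(src0.x), 0.0, 1.0) */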
1712      IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ) {
1713         tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
1714         dst0[CHAN_X] = lp_build_cos( &bld->base, tmp0 );
1715      }
1716      IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y ) {
1717         tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
1718         dst0[CHAN_Y] = lp_build_sin( &bld->base, tmp0 );
1719      }
1720      IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z ) {
1721         dst0[CHAN_Z] = bld->base.zero;
1722      }
1723      IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_W ) {
1724         dst0[CHAN_W] = bld->base.one;
1725      }
1726      break;
1727
1728   case TGSI_OPCODE_TXB:
1729      emit_tex( bld, inst, LP_BLD_TEX_MODIFIER_LOD_BIAS, dst0 );
1730      break;
1731
1732   case TGSI_OPCODE_NRM:
1733      /* fall-through */
1734   case TGSI_OPCODE_NRM4:
1735      /* 3 or 4-component normalization */
1736      {
1737         uint dims = (inst->Instruction.Opcode == TGSI_OPCODE_NRM) ? 3 : 4;
1738
1739         if (IS_DST0_CHANNEL_ENABLED(inst, CHAN_X) ||
1740             IS_DST0_CHANNEL_ENABLED(inst, CHAN_Y) ||
1741             IS_DST0_CHANNEL_ENABLED(inst, CHAN_Z) ||
1742             (IS_DST0_CHANNEL_ENABLED(inst, CHAN_W) && dims == 4)) {
1743
1744            /* Sum of squares is accumulated in tmp0; source components are kept in tmp4..tmp7. */
1745
1746            /* tmp4 = src.x */
1747            /* tmp0 = src.x * src.x */
1748            tmp0 = emit_fetch(bld, inst, 0, CHAN_X);
1749            if (IS_DST0_CHANNEL_ENABLED(inst, CHAN_X)) {
1750               tmp4 = tmp0;
1751            }
1752            tmp0 = lp_build_mul( &bld->base, tmp0, tmp0);
1753
1754            /* tmp5 = src.y */
1755            /* tmp0 = tmp0 + src.y * src.y */
1756            tmp1 = emit_fetch(bld, inst, 0, CHAN_Y);
1757            if (IS_DST0_CHANNEL_ENABLED(inst, CHAN_Y)) {
1758               tmp5 = tmp1;
1759            }
1760            tmp1 = lp_build_mul( &bld->base, tmp1, tmp1);
1761            tmp0 = lp_build_add( &bld->base, tmp0, tmp1);
1762
1763            /* tmp6 = src.z */
1764            /* tmp0 = tmp0 + src.z * src.z */
1765            tmp1 = emit_fetch(bld, inst, 0, CHAN_Z);
1766            if (IS_DST0_CHANNEL_ENABLED(inst, CHAN_Z)) {
1767               tmp6 = tmp1;
1768            }
1769            tmp1 = lp_build_mul( &bld->base, tmp1, tmp1);
1770            tmp0 = lp_build_add( &bld->base, tmp0, tmp1);
1771
1772            if (dims == 4) {
1773               /* tmp7 = src.w */
1774               /* tmp0 = tmp0 + src.w * src.w */
1775               tmp1 = emit_fetch(bld, inst, 0, CHAN_W);
1776               if (IS_DST0_CHANNEL_ENABLED(inst, CHAN_W)) {
1777                  tmp7 = tmp1;
1778               }
1779               tmp1 = lp_build_mul( &bld->base, tmp1, tmp1);
1780               tmp0 = lp_build_add( &bld->base, tmp0, tmp1);
1781            }
1782
1783            /* tmp1 = 1 / sqrt(tmp0) */
1784            tmp1 = lp_build_rsqrt( &bld->base, tmp0);
1785
1786            /* dst.x = tmp1 * src.x */
1787            if (IS_DST0_CHANNEL_ENABLED(inst, CHAN_X)) {
1788               dst0[CHAN_X] = lp_build_mul( &bld->base, tmp4, tmp1);
1789            }
1790
1791            /* dst.y = tmp1 * src.y */
1792            if (IS_DST0_CHANNEL_ENABLED(inst, CHAN_Y)) {
1793               dst0[CHAN_Y] = lp_build_mul( &bld->base, tmp5, tmp1);
1794            }
1795
1796            /* dst.z = tmp1 * src.z */
1797            if (IS_DST0_CHANNEL_ENABLED(inst, CHAN_Z)) {
1798               dst0[CHAN_Z] = lp_build_mul( &bld->base, tmp6, tmp1);
1799            }
1800
1801            /* dst.w = tmp1 * src.w */
1802            if (IS_DST0_CHANNEL_ENABLED(inst, CHAN_W) && dims == 4) {
1803               dst0[CHAN_W] = lp_build_mul( &bld->base, tmp7, tmp1);
1804            }
1805         }
1806
1807         /* dst.w = 1.0 */
1808         if (IS_DST0_CHANNEL_ENABLED(inst, CHAN_W) && dims == 3) {
1809            dst0[CHAN_W] = bld->base.one;
1810         }
1811      }
1812      break;
1813
1814   case TGSI_OPCODE_DIV:
1815      /* deprecated */
1816      assert(0);
1817      return FALSE;
1818      break;
1819
1820   case TGSI_OPCODE_DP2:
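          /* 2-component dot product: dst = src0.x * src1.x + src0.y * src1.y */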
1821      tmp0 = emit_fetch( bld, inst, 0, CHAN_X );  /* tmp0 = src[0].x */
1822      tmp1 = emit_fetch( bld, inst, 1, CHAN_X );  /* tmp1 = src[1].x */
1823      tmp0 = lp_build_mul( &bld->base, tmp0, tmp1);              /* tmp0 = tmp0 * tmp1 */
1824      tmp1 = emit_fetch( bld, inst, 0, CHAN_Y );  /* tmp1 = src[0].y */
1825      tmp2 = emit_fetch( bld, inst, 1, CHAN_Y );  /* tmp2 = src[1].y */
1826      tmp1 = lp_build_mul( &bld->base, tmp1, tmp2);              /* tmp1 = tmp1 * tmp2 */
1827      tmp0 = lp_build_add( &bld->base, tmp0, tmp1);              /* tmp0 = tmp0 + tmp1 */
1828      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1829         dst0[chan_index] = tmp0;  /* dest[ch] = tmp0 */
1830      }
1831      break;
1832
1833   case TGSI_OPCODE_TXL:
1834      emit_tex( bld, inst, LP_BLD_TEX_MODIFIER_EXPLICIT_LOD, dst0 );
1835      break;
1836
1837   case TGSI_OPCODE_TXP:
1838      emit_tex( bld, inst, LP_BLD_TEX_MODIFIER_PROJECTED, dst0 );
1839      break;
1840
1841   case TGSI_OPCODE_BRK:
1842      lp_exec_break(&bld->exec_mask);
1843      break;
1844
1845   case TGSI_OPCODE_IF:
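          /* begin conditional: push (src0.x != 0) onto the execution mask stack */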
1846      tmp0 = emit_fetch(bld, inst, 0, CHAN_X);
1847      tmp0 = lp_build_cmp(&bld->base, PIPE_FUNC_NOTEQUAL,
1848                          tmp0, bld->base.zero);
1849      lp_exec_mask_cond_push(&bld->exec_mask, tmp0);
1850      break;
1851
1852   case TGSI_OPCODE_BGNLOOP:
1853      lp_exec_bgnloop(&bld->exec_mask);
1854      break;
1855
1856   case TGSI_OPCODE_BGNSUB:
1857      lp_exec_mask_bgnsub(&bld->exec_mask);
1858      break;
1859
1860   case TGSI_OPCODE_ELSE:
1861      lp_exec_mask_cond_invert(&bld->exec_mask);
1862      break;
1863
1864   case TGSI_OPCODE_ENDIF:
1865      lp_exec_mask_cond_pop(&bld->exec_mask);
1866      break;
1867
1868   case TGSI_OPCODE_ENDLOOP:
1869      lp_exec_endloop(&bld->exec_mask);
1870      break;
1871
1872   case TGSI_OPCODE_ENDSUB:
1873      lp_exec_mask_endsub(&bld->exec_mask, pc);
1874      break;
1875
1876   case TGSI_OPCODE_PUSHA:
1877      /* deprecated? */
1878      assert(0);
1879      return FALSE;
1880      break;
1881
1882   case TGSI_OPCODE_POPA:
1883      /* deprecated? */
1884      assert(0);
1885      return FALSE;
1886      break;
1887
1888   case TGSI_OPCODE_CEIL:
1889      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1890         tmp0 = emit_fetch( bld, inst, 0, chan_index );
1891         dst0[chan_index] = lp_build_ceil(&bld->base, tmp0);
1892      }
1893      break;
1894
1895   case TGSI_OPCODE_I2F:
1896      /* not implemented */
1897      assert(0);
1898      return FALSE;
1899      break;
1900
1901   case TGSI_OPCODE_NOT:
1902      /* not implemented */
1903      assert(0);
1904      return FALSE;
1905      break;
1906
1907   case TGSI_OPCODE_TRUNC:
1908      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1909         tmp0 = emit_fetch( bld, inst, 0, chan_index );
1910         dst0[chan_index] = lp_build_trunc(&bld->base, tmp0);
1911      }
1912      break;
1913
1914   case TGSI_OPCODE_SHL:
1915      /* not implemented */
1916      assert(0);
1917      return FALSE;
1918      break;
1919
1920   case TGSI_OPCODE_ISHR:
1921      /* not implemented */
1922      assert(0);
1923      return FALSE;
1924      break;
1925
1926   case TGSI_OPCODE_AND:
1927      /* not implemented */
1928      assert(0);
1929      return FALSE;
1930      break;
1931
1932   case TGSI_OPCODE_OR:
1933      /* not implemented */
1934      assert(0);
1935      return FALSE;
1936      break;
1937
1938   case TGSI_OPCODE_MOD:
1939      /* not implemented */
1940      assert(0);
1941      return FALSE;
1942      break;
1943
1944   case TGSI_OPCODE_XOR:
1945      /* not implemented */
1946      assert(0);
1947      return FALSE;
1948      break;
1949
1950   case TGSI_OPCODE_SAD:
1951      /* deprecated? */
1952      assert(0);
1953      return FALSE;
1954      break;
1955
1956   case TGSI_OPCODE_TXF:
1957      /* not implemented */
1958      assert(0);
1959      return FALSE;
1960      break;
1961
1962   case TGSI_OPCODE_TXQ:
1963      /* not implemented */
1964      assert(0);
1965      return FALSE;
1966      break;
1967
1968   case TGSI_OPCODE_CONT:
1969      lp_exec_continue(&bld->exec_mask);
1970      break;
1971
1972   case TGSI_OPCODE_EMIT:
1973      return FALSE;
1974      break;
1975
1976   case TGSI_OPCODE_ENDPRIM:
1977      return FALSE;
1978      break;
1979
1980   case TGSI_OPCODE_NOP:
1981      break;
1982
1983   default:
1984      return FALSE;
1985   }
1986
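       /* Store the per-channel results, honouring the destination write mask
        * and any per-channel predicate. */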
1987   if (info->num_dst) {
1988      LLVMValueRef pred[NUM_CHANNELS];
1989
1990      emit_fetch_predicate( bld, inst, pred );
1991
1992      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1993         emit_store( bld, inst, 0, chan_index, pred[chan_index], dst0[chan_index]);
1994      }
1995   }
1996
1997   return TRUE;
1998}
1999
2000
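    /**
     * Generate LLVM IR for a TGSI token stream, in structure-of-arrays form:
     * one value vector per register channel.  Declarations and immediates are
     * handled while the tokens are parsed; instructions are buffered and then
     * emitted in a second pass so that CAL/RET can jump by instruction index.
     */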
2001void
2002lp_build_tgsi_soa(LLVMBuilderRef builder,
2003                  const struct tgsi_token *tokens,
2004                  struct lp_type type,
2005                  struct lp_build_mask_context *mask,
2006                  LLVMValueRef consts_ptr,
2007                  const LLVMValueRef *pos,
2008                  const LLVMValueRef (*inputs)[NUM_CHANNELS],
2009                  LLVMValueRef (*outputs)[NUM_CHANNELS],
2010                  struct lp_build_sampler_soa *sampler,
2011                  const struct tgsi_shader_info *info)
2012{
2013   struct lp_build_tgsi_soa_context bld;
2014   struct tgsi_parse_context parse;
2015   uint num_immediates = 0;
2016   uint num_instructions = 0;
2017   unsigned i;
2018   int pc = 0;
2019
2020   /* Setup build context */
2021   memset(&bld, 0, sizeof bld);
2022   lp_build_context_init(&bld.base, builder, type);
2023   lp_build_context_init(&bld.int_bld, builder, lp_int_type(type));
2024   bld.mask = mask;
2025   bld.pos = pos;
2026   bld.inputs = inputs;
2027   bld.outputs = outputs;
2028   bld.consts_ptr = consts_ptr;
2029   bld.sampler = sampler;
2030   bld.indirect_files = info->indirect_files;
2031   bld.instructions = (struct tgsi_full_instruction *)
2032                      MALLOC( LP_MAX_INSTRUCTIONS * sizeof(struct tgsi_full_instruction) );
2033   bld.max_instructions = LP_MAX_INSTRUCTIONS;
2034
2035   if (!bld.instructions) {
2036      return;
2037   }
2038
2039   lp_exec_mask_init(&bld.exec_mask, &bld.base);
2040
2041   tgsi_parse_init( &parse, tokens );
2042
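       /* First pass: walk the token stream, emitting declarations and loading
        * immediates as they appear, and buffer every instruction for the
        * emission pass below. */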
2043   while( !tgsi_parse_end_of_tokens( &parse ) ) {
2044      tgsi_parse_token( &parse );
2045
2046      switch( parse.FullToken.Token.Type ) {
2047      case TGSI_TOKEN_TYPE_DECLARATION:
2048         /* Inputs already interpolated */
2049         emit_declaration( &bld, &parse.FullToken.FullDeclaration );
2050         break;
2051
2052      case TGSI_TOKEN_TYPE_INSTRUCTION:
2053         {
2054            /* save expanded instruction */
2055            if (num_instructions == bld.max_instructions) {
2056               struct tgsi_full_instruction *instructions;
2057               instructions = REALLOC(bld.instructions,
2058                                      bld.max_instructions
2059                                      * sizeof(struct tgsi_full_instruction),
2060                                      (bld.max_instructions + LP_MAX_INSTRUCTIONS)
2061                                      * sizeof(struct tgsi_full_instruction));
2062               if (!instructions) {
2063                  break;
2064               }
2065               bld.instructions = instructions;
2066               bld.max_instructions += LP_MAX_INSTRUCTIONS;
2067            }
2068
2069            memcpy(bld.instructions + num_instructions,
2070                   &parse.FullToken.FullInstruction,
2071                   sizeof(bld.instructions[0]));
2072
2073            num_instructions++;
2074         }
2075
2076         break;
2077
2078      case TGSI_TOKEN_TYPE_IMMEDIATE:
2079         /* simply copy the immediate values into the next immediates[] slot */
2080         {
2081            const uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
2082            assert(size <= 4);
2083            assert(num_immediates < LP_MAX_TGSI_IMMEDIATES);
2084            for( i = 0; i < size; ++i )
2085               bld.immediates[num_immediates][i] =
2086                  lp_build_const_vec(type, parse.FullToken.FullImmediate.u[i].Float);
2087            for( i = size; i < 4; ++i )
2088               bld.immediates[num_immediates][i] = bld.base.undef;
2089            num_immediates++;
2090         }
2091         break;
2092
2093      case TGSI_TOKEN_TYPE_PROPERTY:
2094         break;
2095
2096      default:
2097         assert(0);
2098      }
2099   }
2100
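       /* Second pass: translate the buffered instructions to LLVM IR.  The
        * program counter pc is updated by emit_instruction(); CAL/RET redirect
        * it and TGSI_OPCODE_END sets it to -1, ending the loop. */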
2101   while (pc != -1) {
2102      struct tgsi_full_instruction *instr = bld.instructions + pc;
2103      const struct tgsi_opcode_info *opcode_info =
2104         tgsi_get_opcode_info(instr->Instruction.Opcode);
2105      if (!emit_instruction( &bld, instr, opcode_info, &pc ))
2106         _debug_printf("warning: failed to translate tgsi opcode %s to LLVM\n",
2107                       opcode_info->mnemonic);
2108   }
2109
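       /* Debugging aid, disabled by default: dump the TGSI shader and the
        * generated LLVM function. */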
2110   if (0) {
2111      LLVMBasicBlockRef block = LLVMGetInsertBlock(builder);
2112      LLVMValueRef function = LLVMGetBasicBlockParent(block);
2113      debug_printf("11111111111111111111111111111 \n");
2114      tgsi_dump(tokens, 0);
2115      lp_debug_dump_value(function);
2116      debug_printf("2222222222222222222222222222 \n");
2117   }
2118   tgsi_parse_free( &parse );
2119
2120   if (0) {
2121      LLVMModuleRef module = LLVMGetGlobalParent(
2122         LLVMGetBasicBlockParent(LLVMGetInsertBlock(bld.base.builder)));
2123      LLVMDumpModule(module);
2124
2125   }
2126
2127   FREE( bld.instructions );
2128}
2129
2130