lp_bld_tgsi_soa.c revision 58daea741fa21fe3f89fd7bf106df1545c5b21af
1/**************************************************************************
2 *
3 * Copyright 2009 VMware, Inc.
4 * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
5 * All Rights Reserved.
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the
9 * "Software"), to deal in the Software without restriction, including
10 * without limitation the rights to use, copy, modify, merge, publish,
11 * distribute, sub license, and/or sell copies of the Software, and to
12 * permit persons to whom the Software is furnished to do so, subject to
13 * the following conditions:
14 *
15 * The above copyright notice and this permission notice (including the
16 * next paragraph) shall be included in all copies or substantial portions
17 * of the Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
22 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
23 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
24 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
25 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 *
27 **************************************************************************/
28
29/**
30 * @file
31 * TGSI to LLVM IR translation -- SoA.
32 *
33 * @author Jose Fonseca <jfonseca@vmware.com>
34 *
35 * Based on tgsi_sse2.c code written by Michal Krol, Keith Whitwell,
36 * Brian Paul, and others.
37 */
38
39#include "pipe/p_config.h"
40#include "pipe/p_shader_tokens.h"
41#include "util/u_debug.h"
42#include "util/u_math.h"
43#include "util/u_memory.h"
44#include "tgsi/tgsi_dump.h"
45#include "tgsi/tgsi_info.h"
46#include "tgsi/tgsi_parse.h"
47#include "tgsi/tgsi_util.h"
48#include "tgsi/tgsi_scan.h"
49#include "lp_bld_type.h"
50#include "lp_bld_const.h"
51#include "lp_bld_arit.h"
52#include "lp_bld_gather.h"
53#include "lp_bld_logic.h"
54#include "lp_bld_swizzle.h"
55#include "lp_bld_flow.h"
56#include "lp_bld_quad.h"
57#include "lp_bld_tgsi.h"
58#include "lp_bld_limits.h"
59#include "lp_bld_debug.h"
60
61
/* Channel indices of a four-component TGSI register. */
#define NUM_CHANNELS 4
#define CHAN_X 0
#define CHAN_Y 1
#define CHAN_Z 2
#define CHAN_W 3

/* Size limit for the instruction buffer. */
#define LP_MAX_INSTRUCTIONS 256

/* Loop header: iterate CHAN over every channel index. */
#define FOR_EACH_CHANNEL( CHAN )\
   for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)

/* Non-zero when channel CHAN is set in the write mask of INST's first dest. */
#define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
   ((INST)->Dst[0].Register.WriteMask & (1 << (CHAN)))

/* "if" header guarding a statement on IS_DST0_CHANNEL_ENABLED(). */
#define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
   if (IS_DST0_CHANNEL_ENABLED( INST, CHAN ))

/*
 * Loop header: iterate CHAN over the channels enabled in the write mask of
 * INST's first destination.  Expands to "for (...) if (...)", so the
 * statement that follows is the body of the "if".
 */
#define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\
   FOR_EACH_CHANNEL( CHAN )\
      IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )
82
83
/**
 * Execution-mask state used while translating TGSI control flow.
 *
 * Conditionals, loops and subroutine calls each maintain an integer vector
 * mask; lp_exec_mask_update() ANDs the relevant ones together into
 * exec_mask, which lp_exec_mask_store() uses to select which elements of a
 * value are actually written.
 */
struct lp_exec_mask {
   /* Build context providing the builder and vector type for mask code */
   struct lp_build_context *bld;

   /* TRUE while any of the cond/loop/call stacks below is non-empty */
   boolean has_mask;

   /* Integer vector type of all mask values, set by lp_exec_mask_init() */
   LLVMTypeRef int_vec_type;

   /* Enclosing cond_mask values, saved by lp_exec_mask_cond_push() */
   LLVMValueRef cond_stack[LP_MAX_TGSI_NESTING];
   int cond_stack_size;
   /* Mask of the innermost open conditional (all ones outside of any) */
   LLVMValueRef cond_mask;

   /* State of the innermost open loop */
   /* Basic block lp_exec_endloop() branches back to */
   LLVMBasicBlockRef loop_block;
   /* Cleared by lp_exec_continue(), restored by lp_exec_endloop() */
   LLVMValueRef cont_mask;
   /* Cleared by lp_exec_break() */
   LLVMValueRef break_mask;
   /* Stack slot carrying break_mask from one loop iteration to the next */
   LLVMValueRef break_var;
   /* Loop state of the enclosing loops, saved by lp_exec_bgnloop() */
   struct {
      LLVMBasicBlockRef loop_block;
      LLVMValueRef cont_mask;
      LLVMValueRef break_mask;
      LLVMValueRef break_var;
   } loop_stack[LP_MAX_TGSI_NESTING];
   int loop_stack_size;

   /* Cleared by lp_exec_mask_ret() inside a subroutine */
   LLVMValueRef ret_mask;
   /* Return pc and caller's ret_mask, saved by lp_exec_mask_call() */
   struct {
      int pc;
      LLVMValueRef ret_mask;
   } call_stack[LP_MAX_TGSI_NESTING];
   int call_stack_size;

   /* Combined mask, recomputed by lp_exec_mask_update() */
   LLVMValueRef exec_mask;
};
116
/**
 * Per-shader state of the TGSI to LLVM IR (SoA) translation.
 */
struct lp_build_tgsi_soa_context
{
   /* Builder for the shader's vector values (fetch results, arithmetic) */
   struct lp_build_context base;

   /* Builder for integer masks and indices */
   struct lp_build_context int_bld;

   /* Pointer to the constant buffer; indexed as (register * 4 + channel) */
   LLVMValueRef consts_ptr;
   /* NOTE(review): not referenced in this part of the file -- presumably
    * the fragment position values; confirm against the caller. */
   const LLVMValueRef *pos;
   /* Input register values, used directly (no load) by emit_fetch() */
   const LLVMValueRef (*inputs)[NUM_CHANNELS];
   /* Pointers to the output register channels (see emit_declaration()) */
   LLVMValueRef (*outputs)[NUM_CHANNELS];

   /* Texture sampling code generator; may be NULL (see emit_tex()) */
   const struct lp_build_sampler_soa *sampler;

   /* Immediate values, used directly (no load) by emit_fetch() */
   LLVMValueRef immediates[LP_MAX_TGSI_IMMEDIATES][NUM_CHANNELS];
   /* Pointers to the temporary register channels */
   LLVMValueRef temps[LP_MAX_TGSI_TEMPS][NUM_CHANNELS];
   /* Pointers to the address register channels */
   LLVMValueRef addr[LP_MAX_TGSI_ADDRS][NUM_CHANNELS];
   /* Pointers to the predicate register channels */
   LLVMValueRef preds[LP_MAX_TGSI_PREDS][NUM_CHANNELS];

   /* We allocate/use this array of temps if (1 << TGSI_FILE_TEMPORARY) is
    * set in the indirect_files field.
    * The temps[] array above is unused then.
    */
   LLVMValueRef temps_array;

   /** bitmask indicating which register files are accessed indirectly */
   unsigned indirect_files;

   /* Mask updated by emit_kil() / emit_kilp() via lp_build_mask_update() */
   struct lp_build_mask_context *mask;
   /* Control-flow execution mask state */
   struct lp_exec_mask exec_mask;

   /* NOTE(review): not referenced in this part of the file -- presumably a
    * buffer of parsed instructions and its capacity; confirm. */
   struct tgsi_full_instruction *instructions;
   uint max_instructions;
};
151
152static void lp_exec_mask_init(struct lp_exec_mask *mask, struct lp_build_context *bld)
153{
154   mask->bld = bld;
155   mask->has_mask = FALSE;
156   mask->cond_stack_size = 0;
157   mask->loop_stack_size = 0;
158   mask->call_stack_size = 0;
159
160   mask->int_vec_type = lp_build_int_vec_type(mask->bld->type);
161   mask->exec_mask = mask->ret_mask = mask->break_mask = mask->cont_mask = mask->cond_mask =
162         LLVMConstAllOnes(mask->int_vec_type);
163}
164
165static void lp_exec_mask_update(struct lp_exec_mask *mask)
166{
167   if (mask->loop_stack_size) {
168      /*for loops we need to update the entire mask at runtime */
169      LLVMValueRef tmp;
170      assert(mask->break_mask);
171      tmp = LLVMBuildAnd(mask->bld->builder,
172                         mask->cont_mask,
173                         mask->break_mask,
174                         "maskcb");
175      mask->exec_mask = LLVMBuildAnd(mask->bld->builder,
176                                     mask->cond_mask,
177                                     tmp,
178                                     "maskfull");
179   } else
180      mask->exec_mask = mask->cond_mask;
181
182   if (mask->call_stack_size) {
183      mask->exec_mask = LLVMBuildAnd(mask->bld->builder,
184                                     mask->exec_mask,
185                                     mask->ret_mask,
186                                     "callmask");
187   }
188
189   mask->has_mask = (mask->cond_stack_size > 0 ||
190                     mask->loop_stack_size > 0 ||
191                     mask->call_stack_size > 0);
192}
193
194static void lp_exec_mask_cond_push(struct lp_exec_mask *mask,
195                                   LLVMValueRef val)
196{
197   assert(mask->cond_stack_size < LP_MAX_TGSI_NESTING);
198   if (mask->cond_stack_size == 0) {
199      assert(mask->cond_mask == LLVMConstAllOnes(mask->int_vec_type));
200   }
201   mask->cond_stack[mask->cond_stack_size++] = mask->cond_mask;
202   assert(LLVMTypeOf(val) == mask->int_vec_type);
203   mask->cond_mask = LLVMBuildAnd(mask->bld->builder,
204                                  mask->cond_mask,
205                                  val,
206                                  "");
207   lp_exec_mask_update(mask);
208}
209
210static void lp_exec_mask_cond_invert(struct lp_exec_mask *mask)
211{
212   LLVMValueRef prev_mask;
213   LLVMValueRef inv_mask;
214
215   assert(mask->cond_stack_size);
216   prev_mask = mask->cond_stack[mask->cond_stack_size - 1];
217   if (mask->cond_stack_size == 1) {
218      assert(prev_mask == LLVMConstAllOnes(mask->int_vec_type));
219   }
220
221   inv_mask = LLVMBuildNot(mask->bld->builder, mask->cond_mask, "");
222
223   mask->cond_mask = LLVMBuildAnd(mask->bld->builder,
224                                  inv_mask,
225                                  prev_mask, "");
226   lp_exec_mask_update(mask);
227}
228
229static void lp_exec_mask_cond_pop(struct lp_exec_mask *mask)
230{
231   assert(mask->cond_stack_size);
232   mask->cond_mask = mask->cond_stack[--mask->cond_stack_size];
233   lp_exec_mask_update(mask);
234}
235
236static void lp_exec_bgnloop(struct lp_exec_mask *mask)
237{
238   if (mask->loop_stack_size == 0) {
239      assert(mask->loop_block == NULL);
240      assert(mask->cont_mask == LLVMConstAllOnes(mask->int_vec_type));
241      assert(mask->break_mask == LLVMConstAllOnes(mask->int_vec_type));
242      assert(mask->break_var == NULL);
243   }
244
245   assert(mask->loop_stack_size < LP_MAX_TGSI_NESTING);
246
247   mask->loop_stack[mask->loop_stack_size].loop_block = mask->loop_block;
248   mask->loop_stack[mask->loop_stack_size].cont_mask = mask->cont_mask;
249   mask->loop_stack[mask->loop_stack_size].break_mask = mask->break_mask;
250   mask->loop_stack[mask->loop_stack_size].break_var = mask->break_var;
251   ++mask->loop_stack_size;
252
253   mask->break_var = lp_build_alloca(mask->bld->builder, mask->int_vec_type, "");
254   LLVMBuildStore(mask->bld->builder, mask->break_mask, mask->break_var);
255
256   mask->loop_block = lp_build_insert_new_block(mask->bld->builder, "bgnloop");
257   LLVMBuildBr(mask->bld->builder, mask->loop_block);
258   LLVMPositionBuilderAtEnd(mask->bld->builder, mask->loop_block);
259
260   mask->break_mask = LLVMBuildLoad(mask->bld->builder, mask->break_var, "");
261
262   lp_exec_mask_update(mask);
263}
264
265static void lp_exec_break(struct lp_exec_mask *mask)
266{
267   LLVMValueRef exec_mask = LLVMBuildNot(mask->bld->builder,
268                                         mask->exec_mask,
269                                         "break");
270
271   mask->break_mask = LLVMBuildAnd(mask->bld->builder,
272                                   mask->break_mask,
273                                   exec_mask, "break_full");
274
275   lp_exec_mask_update(mask);
276}
277
278static void lp_exec_continue(struct lp_exec_mask *mask)
279{
280   LLVMValueRef exec_mask = LLVMBuildNot(mask->bld->builder,
281                                         mask->exec_mask,
282                                         "");
283
284   mask->cont_mask = LLVMBuildAnd(mask->bld->builder,
285                                  mask->cont_mask,
286                                  exec_mask, "");
287
288   lp_exec_mask_update(mask);
289}
290
291
292static void lp_exec_endloop(struct lp_exec_mask *mask)
293{
294   LLVMBasicBlockRef endloop;
295   LLVMTypeRef reg_type = LLVMIntType(mask->bld->type.width*
296                                      mask->bld->type.length);
297   LLVMValueRef i1cond;
298
299   assert(mask->break_mask);
300
301   /*
302    * Restore the cont_mask, but don't pop
303    */
304   assert(mask->loop_stack_size);
305   mask->cont_mask = mask->loop_stack[mask->loop_stack_size - 1].cont_mask;
306   lp_exec_mask_update(mask);
307
308   /*
309    * Unlike the continue mask, the break_mask must be preserved across loop
310    * iterations
311    */
312   LLVMBuildStore(mask->bld->builder, mask->break_mask, mask->break_var);
313
314   /* i1cond = (mask == 0) */
315   i1cond = LLVMBuildICmp(
316      mask->bld->builder,
317      LLVMIntNE,
318      LLVMBuildBitCast(mask->bld->builder, mask->exec_mask, reg_type, ""),
319      LLVMConstNull(reg_type), "");
320
321   endloop = lp_build_insert_new_block(mask->bld->builder, "endloop");
322
323   LLVMBuildCondBr(mask->bld->builder,
324                   i1cond, mask->loop_block, endloop);
325
326   LLVMPositionBuilderAtEnd(mask->bld->builder, endloop);
327
328   assert(mask->loop_stack_size);
329   --mask->loop_stack_size;
330   mask->loop_block = mask->loop_stack[mask->loop_stack_size].loop_block;
331   mask->cont_mask = mask->loop_stack[mask->loop_stack_size].cont_mask;
332   mask->break_mask = mask->loop_stack[mask->loop_stack_size].break_mask;
333   mask->break_var = mask->loop_stack[mask->loop_stack_size].break_var;
334
335   lp_exec_mask_update(mask);
336}
337
338/* stores val into an address pointed to by dst.
339 * mask->exec_mask is used to figure out which bits of val
340 * should be stored into the address
341 * (0 means don't store this bit, 1 means do store).
342 */
343static void lp_exec_mask_store(struct lp_exec_mask *mask,
344                               LLVMValueRef pred,
345                               LLVMValueRef val,
346                               LLVMValueRef dst)
347{
348   /* Mix the predicate and execution mask */
349   if (mask->has_mask) {
350      if (pred) {
351         pred = LLVMBuildAnd(mask->bld->builder, pred, mask->exec_mask, "");
352      } else {
353         pred = mask->exec_mask;
354      }
355   }
356
357   if (pred) {
358      LLVMValueRef real_val, dst_val;
359
360      dst_val = LLVMBuildLoad(mask->bld->builder, dst, "");
361      real_val = lp_build_select(mask->bld,
362                                 pred,
363                                 val, dst_val);
364
365      LLVMBuildStore(mask->bld->builder, real_val, dst);
366   } else
367      LLVMBuildStore(mask->bld->builder, val, dst);
368}
369
370static void lp_exec_mask_call(struct lp_exec_mask *mask,
371                              int func,
372                              int *pc)
373{
374   assert(mask->call_stack_size < LP_MAX_TGSI_NESTING);
375   mask->call_stack[mask->call_stack_size].pc = *pc;
376   mask->call_stack[mask->call_stack_size].ret_mask = mask->ret_mask;
377   mask->call_stack_size++;
378   *pc = func;
379}
380
381static void lp_exec_mask_ret(struct lp_exec_mask *mask, int *pc)
382{
383   LLVMValueRef exec_mask;
384
385   if (mask->call_stack_size == 0) {
386      /* returning from main() */
387      *pc = -1;
388      return;
389   }
390   exec_mask = LLVMBuildNot(mask->bld->builder,
391                            mask->exec_mask,
392                            "ret");
393
394   mask->ret_mask = LLVMBuildAnd(mask->bld->builder,
395                                 mask->ret_mask,
396                                 exec_mask, "ret_full");
397
398   lp_exec_mask_update(mask);
399}
400
/**
 * Start of a subroutine body.  Nothing needs to be done here; the call
 * stack is maintained by lp_exec_mask_call() and lp_exec_mask_endsub().
 */
static void lp_exec_mask_bgnsub(struct lp_exec_mask *mask)
{
   (void) mask;
}
404
405static void lp_exec_mask_endsub(struct lp_exec_mask *mask, int *pc)
406{
407   assert(mask->call_stack_size);
408   mask->call_stack_size--;
409   *pc = mask->call_stack[mask->call_stack_size].pc;
410   mask->ret_mask = mask->call_stack[mask->call_stack_size].ret_mask;
411   lp_exec_mask_update(mask);
412}
413
414
415/**
416 * Return pointer to a temporary register channel (src or dest).
417 * Note that indirect addressing cannot be handled here.
418 * \param index  which temporary register
419 * \param chan  which channel of the temp register.
420 */
421static LLVMValueRef
422get_temp_ptr(struct lp_build_tgsi_soa_context *bld,
423             unsigned index,
424             unsigned chan)
425{
426   assert(chan < 4);
427   if (bld->indirect_files & (1 << TGSI_FILE_TEMPORARY)) {
428      LLVMValueRef lindex = lp_build_const_int32(index * 4 + chan);
429      return LLVMBuildGEP(bld->base.builder, bld->temps_array, &lindex, 1, "");
430   }
431   else {
432      return bld->temps[index][chan];
433   }
434}
435
436
437/**
438 * Gather vector.
439 * XXX the lp_build_gather() function should be capable of doing this
440 * with a little work.
441 */
442static LLVMValueRef
443build_gather(struct lp_build_tgsi_soa_context *bld,
444             LLVMValueRef base_ptr,
445             LLVMValueRef indexes)
446{
447   LLVMValueRef res = bld->base.undef;
448   unsigned i;
449
450   /*
451    * Loop over elements of index_vec, load scalar value, insert it into 'res'.
452    */
453   for (i = 0; i < bld->base.type.length; i++) {
454      LLVMValueRef ii = LLVMConstInt(LLVMInt32Type(), i, 0);
455      LLVMValueRef index = LLVMBuildExtractElement(bld->base.builder,
456                                                   indexes, ii, "");
457      LLVMValueRef scalar_ptr = LLVMBuildGEP(bld->base.builder, base_ptr,
458                                             &index, 1, "");
459      LLVMValueRef scalar = LLVMBuildLoad(bld->base.builder, scalar_ptr, "");
460
461      res = LLVMBuildInsertElement(bld->base.builder, res, scalar, ii, "");
462   }
463
464   return res;
465}
466
467
468/**
469 * Read the current value of the ADDR register, convert the floats to
470 * ints, multiply by four and return the vector of offsets.
471 * The offsets will be used to index into the constant buffer or
472 * temporary register file.
473 */
474static LLVMValueRef
475get_indirect_offsets(struct lp_build_tgsi_soa_context *bld,
476                     const struct tgsi_src_register *indirect_reg)
477{
478   /* always use X component of address register */
479   const int x = indirect_reg->SwizzleX;
480   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->base.type);
481   uint swizzle = tgsi_util_get_src_register_swizzle(indirect_reg, x);
482   LLVMValueRef vec4 = lp_build_const_int_vec(bld->int_bld.type, 4);
483   LLVMValueRef addr_vec;
484
485   addr_vec = LLVMBuildLoad(bld->base.builder,
486                            bld->addr[indirect_reg->Index][swizzle],
487                            "load addr reg");
488
489   /* for indexing we want integers */
490   addr_vec = LLVMBuildFPToSI(bld->base.builder, addr_vec,
491                              int_vec_type, "");
492
493   /* addr_vec = addr_vec * 4 */
494   addr_vec = lp_build_mul(&bld->int_bld, addr_vec, vec4);
495
496   return addr_vec;
497}
498
499
500/**
501 * Register fetch.
502 */
503static LLVMValueRef
504emit_fetch(
505   struct lp_build_tgsi_soa_context *bld,
506   const struct tgsi_full_instruction *inst,
507   unsigned src_op,
508   const unsigned chan_index )
509{
510   const struct tgsi_full_src_register *reg = &inst->Src[src_op];
511   const unsigned swizzle =
512      tgsi_util_get_full_src_register_swizzle(reg, chan_index);
513   LLVMValueRef res;
514   LLVMValueRef addr_vec = NULL;
515
516   if (swizzle > 3) {
517      assert(0 && "invalid swizzle in emit_fetch()");
518      return bld->base.undef;
519   }
520
521   if (reg->Register.Indirect) {
522      assert(bld->indirect_files);
523      addr_vec = get_indirect_offsets(bld, &reg->Indirect);
524   }
525
526   switch (reg->Register.File) {
527   case TGSI_FILE_CONSTANT:
528      if (reg->Register.Indirect) {
529         LLVMValueRef index_vec;  /* index into the const buffer */
530
531         assert(bld->indirect_files & (1 << TGSI_FILE_CONSTANT));
532
533         /* index_vec = broadcast(reg->Register.Index * 4 + swizzle) */
534         index_vec = lp_build_const_int_vec(bld->int_bld.type,
535                                            reg->Register.Index * 4 + swizzle);
536
537         /* index_vec = index_vec + addr_vec */
538         index_vec = lp_build_add(&bld->int_bld, index_vec, addr_vec);
539
540         /* Gather values from the constant buffer */
541         res = build_gather(bld, bld->consts_ptr, index_vec);
542      }
543      else {
544         LLVMValueRef index;  /* index into the const buffer */
545         LLVMValueRef scalar, scalar_ptr;
546
547         index = lp_build_const_int32(reg->Register.Index*4 + swizzle);
548
549         scalar_ptr = LLVMBuildGEP(bld->base.builder, bld->consts_ptr,
550                                   &index, 1, "");
551         scalar = LLVMBuildLoad(bld->base.builder, scalar_ptr, "");
552
553         res = lp_build_broadcast_scalar(&bld->base, scalar);
554      }
555      break;
556
557   case TGSI_FILE_IMMEDIATE:
558      res = bld->immediates[reg->Register.Index][swizzle];
559      assert(res);
560      break;
561
562   case TGSI_FILE_INPUT:
563      res = bld->inputs[reg->Register.Index][swizzle];
564      assert(res);
565      break;
566
567   case TGSI_FILE_TEMPORARY:
568      if (reg->Register.Indirect) {
569         LLVMValueRef vec_len =
570            lp_build_const_int_vec(bld->int_bld.type, bld->base.type.length);
571         LLVMValueRef index_vec;  /* index into the const buffer */
572         LLVMValueRef temps_array;
573         LLVMTypeRef float4_ptr_type;
574
575         assert(bld->indirect_files & (1 << TGSI_FILE_TEMPORARY));
576
577         /* index_vec = broadcast(reg->Register.Index * 4 + swizzle) */
578         index_vec = lp_build_const_int_vec(bld->int_bld.type,
579                                            reg->Register.Index * 4 + swizzle);
580
581         /* index_vec += addr_vec */
582         index_vec = lp_build_add(&bld->int_bld, index_vec, addr_vec);
583
584         /* index_vec *= vector_length */
585         index_vec = lp_build_mul(&bld->int_bld, index_vec, vec_len);
586
587         /* cast temps_array pointer to float* */
588         float4_ptr_type = LLVMPointerType(LLVMFloatType(), 0);
589         temps_array = LLVMBuildBitCast(bld->int_bld.builder, bld->temps_array,
590                                        float4_ptr_type, "");
591
592         /* Gather values from the temporary register array */
593         res = build_gather(bld, temps_array, index_vec);
594      }
595      else {
596         LLVMValueRef temp_ptr;
597         temp_ptr = get_temp_ptr(bld, reg->Register.Index, swizzle);
598         res = LLVMBuildLoad(bld->base.builder, temp_ptr, "");
599         if (!res)
600            return bld->base.undef;
601      }
602      break;
603
604   default:
605      assert(0 && "invalid src register in emit_fetch()");
606      return bld->base.undef;
607   }
608
609   switch( tgsi_util_get_full_src_register_sign_mode( reg, chan_index ) ) {
610   case TGSI_UTIL_SIGN_CLEAR:
611      res = lp_build_abs( &bld->base, res );
612      break;
613
614   case TGSI_UTIL_SIGN_SET:
615      res = lp_build_abs( &bld->base, res );
616      /* fall through */
617   case TGSI_UTIL_SIGN_TOGGLE:
618      res = lp_build_negate( &bld->base, res );
619      break;
620
621   case TGSI_UTIL_SIGN_KEEP:
622      break;
623   }
624
625   return res;
626}
627
628
629/**
630 * Register fetch with derivatives.
631 */
632static void
633emit_fetch_deriv(
634   struct lp_build_tgsi_soa_context *bld,
635   const struct tgsi_full_instruction *inst,
636   unsigned index,
637   const unsigned chan_index,
638   LLVMValueRef *res,
639   LLVMValueRef *ddx,
640   LLVMValueRef *ddy)
641{
642   LLVMValueRef src;
643
644   src = emit_fetch(bld, inst, index, chan_index);
645
646   if(res)
647      *res = src;
648
649   /* TODO: use interpolation coeffs for inputs */
650
651   if(ddx)
652      *ddx = lp_build_ddx(&bld->base, src);
653
654   if(ddy)
655      *ddy = lp_build_ddy(&bld->base, src);
656}
657
658
659/**
660 * Predicate.
661 */
662static void
663emit_fetch_predicate(
664   struct lp_build_tgsi_soa_context *bld,
665   const struct tgsi_full_instruction *inst,
666   LLVMValueRef *pred)
667{
668   unsigned index;
669   unsigned char swizzles[4];
670   LLVMValueRef unswizzled[4] = {NULL, NULL, NULL, NULL};
671   LLVMValueRef value;
672   unsigned chan;
673
674   if (!inst->Instruction.Predicate) {
675      FOR_EACH_CHANNEL( chan ) {
676         pred[chan] = NULL;
677      }
678      return;
679   }
680
681   swizzles[0] = inst->Predicate.SwizzleX;
682   swizzles[1] = inst->Predicate.SwizzleY;
683   swizzles[2] = inst->Predicate.SwizzleZ;
684   swizzles[3] = inst->Predicate.SwizzleW;
685
686   index = inst->Predicate.Index;
687   assert(index < LP_MAX_TGSI_PREDS);
688
689   FOR_EACH_CHANNEL( chan ) {
690      unsigned swizzle = swizzles[chan];
691
692      /*
693       * Only fetch the predicate register channels that are actually listed
694       * in the swizzles
695       */
696      if (!unswizzled[swizzle]) {
697         value = LLVMBuildLoad(bld->base.builder,
698                               bld->preds[index][swizzle], "");
699
700         /*
701          * Convert the value to an integer mask.
702          *
703          * TODO: Short-circuit this comparison -- a D3D setp_xx instructions
704          * is needlessly causing two comparisons due to storing the intermediate
705          * result as float vector instead of an integer mask vector.
706          */
707         value = lp_build_compare(bld->base.builder,
708                                  bld->base.type,
709                                  PIPE_FUNC_NOTEQUAL,
710                                  value,
711                                  bld->base.zero);
712         if (inst->Predicate.Negate) {
713            value = LLVMBuildNot(bld->base.builder, value, "");
714         }
715
716         unswizzled[swizzle] = value;
717      } else {
718         value = unswizzled[swizzle];
719      }
720
721      pred[chan] = value;
722   }
723}
724
725
726/**
727 * Register store.
728 */
729static void
730emit_store(
731   struct lp_build_tgsi_soa_context *bld,
732   const struct tgsi_full_instruction *inst,
733   unsigned index,
734   unsigned chan_index,
735   LLVMValueRef pred,
736   LLVMValueRef value)
737{
738   const struct tgsi_full_dst_register *reg = &inst->Dst[index];
739   LLVMValueRef addr = NULL;
740
741   switch( inst->Instruction.Saturate ) {
742   case TGSI_SAT_NONE:
743      break;
744
745   case TGSI_SAT_ZERO_ONE:
746      value = lp_build_max(&bld->base, value, bld->base.zero);
747      value = lp_build_min(&bld->base, value, bld->base.one);
748      break;
749
750   case TGSI_SAT_MINUS_PLUS_ONE:
751      value = lp_build_max(&bld->base, value, lp_build_const_vec(bld->base.type, -1.0));
752      value = lp_build_min(&bld->base, value, bld->base.one);
753      break;
754
755   default:
756      assert(0);
757   }
758
759   if (reg->Register.Indirect) {
760      /* XXX use get_indirect_offsets() here eventually */
761      LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->base.type);
762      unsigned swizzle = tgsi_util_get_src_register_swizzle( &reg->Indirect, chan_index );
763
764      assert(bld->indirect_files);
765
766      addr = LLVMBuildLoad(bld->base.builder,
767                           bld->addr[reg->Indirect.Index][swizzle],
768                           "");
769      /* for indexing we want integers */
770      addr = LLVMBuildFPToSI(bld->base.builder, addr,
771                             int_vec_type, "");
772      addr = LLVMBuildExtractElement(bld->base.builder,
773                                     addr, LLVMConstInt(LLVMInt32Type(), 0, 0),
774                                     "");
775      addr = LLVMBuildMul(bld->base.builder,
776                          addr, LLVMConstInt(LLVMInt32Type(), 4, 0),
777                          "");
778   }
779
780   switch( reg->Register.File ) {
781   case TGSI_FILE_OUTPUT:
782      lp_exec_mask_store(&bld->exec_mask, pred, value,
783                         bld->outputs[reg->Register.Index][chan_index]);
784      break;
785
786   case TGSI_FILE_TEMPORARY:
787      if (reg->Register.Indirect) {
788         /* XXX not done yet */
789         debug_printf("WARNING: LLVM scatter store of temp regs"
790                      " not implemented\n");
791      }
792      else {
793         LLVMValueRef temp_ptr = get_temp_ptr(bld, reg->Register.Index,
794                                              chan_index);
795         lp_exec_mask_store(&bld->exec_mask, pred, value, temp_ptr);
796      }
797      break;
798
799   case TGSI_FILE_ADDRESS:
800      lp_exec_mask_store(&bld->exec_mask, pred, value,
801                         bld->addr[reg->Indirect.Index][chan_index]);
802      break;
803
804   case TGSI_FILE_PREDICATE:
805      lp_exec_mask_store(&bld->exec_mask, pred, value,
806                         bld->preds[reg->Register.Index][chan_index]);
807      break;
808
809   default:
810      assert( 0 );
811   }
812}
813
814
815/**
816 * High-level instruction translators.
817 */
818
819static void
820emit_tex( struct lp_build_tgsi_soa_context *bld,
821          const struct tgsi_full_instruction *inst,
822          enum lp_build_tex_modifier modifier,
823          LLVMValueRef *texel)
824{
825   unsigned unit;
826   LLVMValueRef lod_bias, explicit_lod;
827   LLVMValueRef oow = NULL;
828   LLVMValueRef coords[3];
829   LLVMValueRef ddx[3];
830   LLVMValueRef ddy[3];
831   unsigned num_coords;
832   unsigned i;
833
834   if (!bld->sampler) {
835      _debug_printf("warning: found texture instruction but no sampler generator supplied\n");
836      for (i = 0; i < 4; i++) {
837         texel[i] = bld->base.undef;
838      }
839      return;
840   }
841
842   switch (inst->Texture.Texture) {
843   case TGSI_TEXTURE_1D:
844      num_coords = 1;
845      break;
846   case TGSI_TEXTURE_2D:
847   case TGSI_TEXTURE_RECT:
848      num_coords = 2;
849      break;
850   case TGSI_TEXTURE_SHADOW1D:
851   case TGSI_TEXTURE_SHADOW2D:
852   case TGSI_TEXTURE_SHADOWRECT:
853   case TGSI_TEXTURE_3D:
854   case TGSI_TEXTURE_CUBE:
855      num_coords = 3;
856      break;
857   default:
858      assert(0);
859      return;
860   }
861
862   if (modifier == LP_BLD_TEX_MODIFIER_LOD_BIAS) {
863      lod_bias = emit_fetch( bld, inst, 0, 3 );
864      explicit_lod = NULL;
865   }
866   else if (modifier == LP_BLD_TEX_MODIFIER_EXPLICIT_LOD) {
867      lod_bias = NULL;
868      explicit_lod = emit_fetch( bld, inst, 0, 3 );
869   }
870   else {
871      lod_bias = NULL;
872      explicit_lod = NULL;
873   }
874
875   if (modifier == LP_BLD_TEX_MODIFIER_PROJECTED) {
876      oow = emit_fetch( bld, inst, 0, 3 );
877      oow = lp_build_rcp(&bld->base, oow);
878   }
879
880   for (i = 0; i < num_coords; i++) {
881      coords[i] = emit_fetch( bld, inst, 0, i );
882      if (modifier == LP_BLD_TEX_MODIFIER_PROJECTED)
883         coords[i] = lp_build_mul(&bld->base, coords[i], oow);
884   }
885   for (i = num_coords; i < 3; i++) {
886      coords[i] = bld->base.undef;
887   }
888
889   if (modifier == LP_BLD_TEX_MODIFIER_EXPLICIT_DERIV) {
890      for (i = 0; i < num_coords; i++) {
891         ddx[i] = emit_fetch( bld, inst, 1, i );
892         ddy[i] = emit_fetch( bld, inst, 2, i );
893      }
894      unit = inst->Src[3].Register.Index;
895   }  else {
896      for (i = 0; i < num_coords; i++) {
897         ddx[i] = lp_build_ddx( &bld->base, coords[i] );
898         ddy[i] = lp_build_ddy( &bld->base, coords[i] );
899      }
900      unit = inst->Src[1].Register.Index;
901   }
902   for (i = num_coords; i < 3; i++) {
903      ddx[i] = bld->base.undef;
904      ddy[i] = bld->base.undef;
905   }
906
907   bld->sampler->emit_fetch_texel(bld->sampler,
908                                  bld->base.builder,
909                                  bld->base.type,
910                                  unit, num_coords, coords,
911                                  ddx, ddy,
912                                  lod_bias, explicit_lod,
913                                  texel);
914}
915
916
917/**
918 * Kill fragment if any of the src register values are negative.
919 */
920static void
921emit_kil(
922   struct lp_build_tgsi_soa_context *bld,
923   const struct tgsi_full_instruction *inst )
924{
925   const struct tgsi_full_src_register *reg = &inst->Src[0];
926   LLVMValueRef terms[NUM_CHANNELS];
927   LLVMValueRef mask;
928   unsigned chan_index;
929
930   memset(&terms, 0, sizeof terms);
931
932   FOR_EACH_CHANNEL( chan_index ) {
933      unsigned swizzle;
934
935      /* Unswizzle channel */
936      swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index );
937
938      /* Check if the component has not been already tested. */
939      assert(swizzle < NUM_CHANNELS);
940      if( !terms[swizzle] )
941         /* TODO: change the comparison operator instead of setting the sign */
942         terms[swizzle] =  emit_fetch(bld, inst, 0, chan_index );
943   }
944
945   mask = NULL;
946   FOR_EACH_CHANNEL( chan_index ) {
947      if(terms[chan_index]) {
948         LLVMValueRef chan_mask;
949
950         /*
951          * If term < 0 then mask = 0 else mask = ~0.
952          */
953         chan_mask = lp_build_cmp(&bld->base, PIPE_FUNC_GEQUAL, terms[chan_index], bld->base.zero);
954
955         if(mask)
956            mask = LLVMBuildAnd(bld->base.builder, mask, chan_mask, "");
957         else
958            mask = chan_mask;
959      }
960   }
961
962   if(mask)
963      lp_build_mask_update(bld->mask, mask);
964}
965
966
967/**
968 * Predicated fragment kill.
969 * XXX Actually, we do an unconditional kill (as in tgsi_exec.c).
970 * The only predication is the execution mask which will apply if
971 * we're inside a loop or conditional.
972 */
973static void
974emit_kilp(struct lp_build_tgsi_soa_context *bld,
975          const struct tgsi_full_instruction *inst)
976{
977   LLVMValueRef mask;
978
979   /* For those channels which are "alive", disable fragment shader
980    * execution.
981    */
982   if (bld->exec_mask.has_mask) {
983      mask = LLVMBuildNot(bld->base.builder, bld->exec_mask.exec_mask, "kilp");
984   }
985   else {
986      mask = bld->base.zero;
987   }
988
989   lp_build_mask_update(bld->mask, mask);
990}
991
992static void
993emit_declaration(
994   struct lp_build_tgsi_soa_context *bld,
995   const struct tgsi_full_declaration *decl)
996{
997   LLVMTypeRef vec_type = lp_build_vec_type(bld->base.type);
998
999   unsigned first = decl->Range.First;
1000   unsigned last = decl->Range.Last;
1001   unsigned idx, i;
1002
1003   for (idx = first; idx <= last; ++idx) {
1004      switch (decl->Declaration.File) {
1005      case TGSI_FILE_TEMPORARY:
1006         assert(idx < LP_MAX_TGSI_TEMPS);
1007         if (bld->indirect_files & (1 << TGSI_FILE_TEMPORARY)) {
1008            LLVMValueRef array_size = LLVMConstInt(LLVMInt32Type(),
1009                                                   last*4 + 4, 0);
1010            bld->temps_array = lp_build_array_alloca(bld->base.builder,
1011                                                     vec_type, array_size, "");
1012         } else {
1013            for (i = 0; i < NUM_CHANNELS; i++)
1014               bld->temps[idx][i] = lp_build_alloca(bld->base.builder,
1015                                                    vec_type, "");
1016         }
1017         break;
1018
1019      case TGSI_FILE_OUTPUT:
1020         for (i = 0; i < NUM_CHANNELS; i++)
1021            bld->outputs[idx][i] = lp_build_alloca(bld->base.builder,
1022                                                   vec_type, "");
1023         break;
1024
1025      case TGSI_FILE_ADDRESS:
1026         assert(idx < LP_MAX_TGSI_ADDRS);
1027         for (i = 0; i < NUM_CHANNELS; i++)
1028            bld->addr[idx][i] = lp_build_alloca(bld->base.builder,
1029                                                vec_type, "");
1030         break;
1031
1032      case TGSI_FILE_PREDICATE:
1033         assert(idx < LP_MAX_TGSI_PREDS);
1034         for (i = 0; i < NUM_CHANNELS; i++)
1035            bld->preds[idx][i] = lp_build_alloca(bld->base.builder,
1036                                                 vec_type, "");
1037         break;
1038
1039      default:
1040         /* don't need to declare other vars */
1041         break;
1042      }
1043   }
1044}
1045
1046
1047/**
1048 * Emit LLVM for one TGSI instruction.
 * \return TRUE for success, FALSE otherwise
1050 */
1051static boolean
1052emit_instruction(
1053   struct lp_build_tgsi_soa_context *bld,
1054   const struct tgsi_full_instruction *inst,
1055   const struct tgsi_opcode_info *info,
1056   int *pc)
1057{
1058   unsigned chan_index;
1059   LLVMValueRef src0, src1, src2;
1060   LLVMValueRef tmp0, tmp1, tmp2;
1061   LLVMValueRef tmp3 = NULL;
1062   LLVMValueRef tmp4 = NULL;
1063   LLVMValueRef tmp5 = NULL;
1064   LLVMValueRef tmp6 = NULL;
1065   LLVMValueRef tmp7 = NULL;
1066   LLVMValueRef res;
1067   LLVMValueRef dst0[NUM_CHANNELS];
1068
1069   /*
1070    * Stores and write masks are handled in a general fashion after the long
1071    * instruction opcode switch statement.
1072    *
    * Although not strictly necessary, we avoid generating instructions for
    * channels which won't be stored, in cases where that's easy. For some
1075    * complex instructions, like texture sampling, it is more convenient to
1076    * assume a full writemask and then let LLVM optimization passes eliminate
1077    * redundant code.
1078    */
1079
1080   (*pc)++;
1081
1082   assert(info->num_dst <= 1);
1083   if (info->num_dst) {
1084      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1085         dst0[chan_index] = bld->base.undef;
1086      }
1087   }
1088
1089   switch (inst->Instruction.Opcode) {
1090   case TGSI_OPCODE_ARL:
1091      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1092         tmp0 = emit_fetch( bld, inst, 0, chan_index );
1093         tmp0 = lp_build_floor(&bld->base, tmp0);
1094         dst0[chan_index] = tmp0;
1095      }
1096      break;
1097
1098   case TGSI_OPCODE_MOV:
1099      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1100         dst0[chan_index] = emit_fetch( bld, inst, 0, chan_index );
1101      }
1102      break;
1103
1104   case TGSI_OPCODE_LIT:
1105      if( IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ) ) {
1106         dst0[CHAN_X] = bld->base.one;
1107      }
1108      if( IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y ) ) {
1109         src0 = emit_fetch( bld, inst, 0, CHAN_X );
1110         dst0[CHAN_Y] = lp_build_max( &bld->base, src0, bld->base.zero);
1111      }
1112      if( IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z ) ) {
1113         /* XMM[1] = SrcReg[0].yyyy */
1114         tmp1 = emit_fetch( bld, inst, 0, CHAN_Y );
1115         /* XMM[1] = max(XMM[1], 0) */
1116         tmp1 = lp_build_max( &bld->base, tmp1, bld->base.zero);
1117         /* XMM[2] = SrcReg[0].wwww */
1118         tmp2 = emit_fetch( bld, inst, 0, CHAN_W );
1119         tmp1 = lp_build_pow( &bld->base, tmp1, tmp2);
1120         tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
1121         tmp2 = lp_build_cmp(&bld->base, PIPE_FUNC_GREATER, tmp0, bld->base.zero);
1122         dst0[CHAN_Z] = lp_build_select(&bld->base, tmp2, tmp1, bld->base.zero);
1123      }
1124      if( IS_DST0_CHANNEL_ENABLED( inst, CHAN_W ) ) {
1125         dst0[CHAN_W] = bld->base.one;
1126      }
1127      break;
1128
1129   case TGSI_OPCODE_RCP:
1130   /* TGSI_OPCODE_RECIP */
1131      src0 = emit_fetch( bld, inst, 0, CHAN_X );
1132      res = lp_build_rcp(&bld->base, src0);
1133      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1134         dst0[chan_index] = res;
1135      }
1136      break;
1137
1138   case TGSI_OPCODE_RSQ:
1139   /* TGSI_OPCODE_RECIPSQRT */
1140      src0 = emit_fetch( bld, inst, 0, CHAN_X );
1141      src0 = lp_build_abs(&bld->base, src0);
1142      res = lp_build_rsqrt(&bld->base, src0);
1143      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1144         dst0[chan_index] = res;
1145      }
1146      break;
1147
1148   case TGSI_OPCODE_EXP:
1149      if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ) ||
1150          IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y ) ||
1151          IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z )) {
1152         LLVMValueRef *p_exp2_int_part = NULL;
1153         LLVMValueRef *p_frac_part = NULL;
1154         LLVMValueRef *p_exp2 = NULL;
1155
1156         src0 = emit_fetch( bld, inst, 0, CHAN_X );
1157
1158         if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ))
1159            p_exp2_int_part = &tmp0;
1160         if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y ))
1161            p_frac_part = &tmp1;
1162         if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z ))
1163            p_exp2 = &tmp2;
1164
1165         lp_build_exp2_approx(&bld->base, src0, p_exp2_int_part, p_frac_part, p_exp2);
1166
1167         if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ))
1168            dst0[CHAN_X] = tmp0;
1169         if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y ))
1170            dst0[CHAN_Y] = tmp1;
1171         if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z ))
1172            dst0[CHAN_Z] = tmp2;
1173      }
1174      /* dst.w = 1.0 */
1175      if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_W )) {
1176         dst0[CHAN_W] = bld->base.one;
1177      }
1178      break;
1179
1180   case TGSI_OPCODE_LOG:
1181      if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ) ||
1182          IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y ) ||
1183          IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z )) {
1184         LLVMValueRef *p_floor_log2 = NULL;
1185         LLVMValueRef *p_exp = NULL;
1186         LLVMValueRef *p_log2 = NULL;
1187
1188         src0 = emit_fetch( bld, inst, 0, CHAN_X );
1189         src0 = lp_build_abs( &bld->base, src0 );
1190
1191         if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ))
1192            p_floor_log2 = &tmp0;
1193         if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y ))
1194            p_exp = &tmp1;
1195         if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z ))
1196            p_log2 = &tmp2;
1197
1198         lp_build_log2_approx(&bld->base, src0, p_exp, p_floor_log2, p_log2);
1199
1200         /* dst.x = floor(lg2(abs(src.x))) */
1201         if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ))
1202            dst0[CHAN_X] = tmp0;
1203         /* dst.y = abs(src)/ex2(floor(lg2(abs(src.x)))) */
1204         if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y )) {
1205            dst0[CHAN_Y] = lp_build_div( &bld->base, src0, tmp1);
1206         }
1207         /* dst.z = lg2(abs(src.x)) */
1208         if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z ))
1209            dst0[CHAN_Z] = tmp2;
1210      }
1211      /* dst.w = 1.0 */
1212      if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_W )) {
1213         dst0[CHAN_W] = bld->base.one;
1214      }
1215      break;
1216
1217   case TGSI_OPCODE_MUL:
1218      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1219         src0 = emit_fetch( bld, inst, 0, chan_index );
1220         src1 = emit_fetch( bld, inst, 1, chan_index );
1221         dst0[chan_index] = lp_build_mul(&bld->base, src0, src1);
1222      }
1223      break;
1224
1225   case TGSI_OPCODE_ADD:
1226      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1227         src0 = emit_fetch( bld, inst, 0, chan_index );
1228         src1 = emit_fetch( bld, inst, 1, chan_index );
1229         dst0[chan_index] = lp_build_add(&bld->base, src0, src1);
1230      }
1231      break;
1232
1233   case TGSI_OPCODE_DP3:
1234   /* TGSI_OPCODE_DOT3 */
1235      tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
1236      tmp1 = emit_fetch( bld, inst, 1, CHAN_X );
1237      tmp0 = lp_build_mul( &bld->base, tmp0, tmp1);
1238      tmp1 = emit_fetch( bld, inst, 0, CHAN_Y );
1239      tmp2 = emit_fetch( bld, inst, 1, CHAN_Y );
1240      tmp1 = lp_build_mul( &bld->base, tmp1, tmp2);
1241      tmp0 = lp_build_add( &bld->base, tmp0, tmp1);
1242      tmp1 = emit_fetch( bld, inst, 0, CHAN_Z );
1243      tmp2 = emit_fetch( bld, inst, 1, CHAN_Z );
1244      tmp1 = lp_build_mul( &bld->base, tmp1, tmp2);
1245      tmp0 = lp_build_add( &bld->base, tmp0, tmp1);
1246      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1247         dst0[chan_index] = tmp0;
1248      }
1249      break;
1250
1251   case TGSI_OPCODE_DP4:
1252   /* TGSI_OPCODE_DOT4 */
1253      tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
1254      tmp1 = emit_fetch( bld, inst, 1, CHAN_X );
1255      tmp0 = lp_build_mul( &bld->base, tmp0, tmp1);
1256      tmp1 = emit_fetch( bld, inst, 0, CHAN_Y );
1257      tmp2 = emit_fetch( bld, inst, 1, CHAN_Y );
1258      tmp1 = lp_build_mul( &bld->base, tmp1, tmp2);
1259      tmp0 = lp_build_add( &bld->base, tmp0, tmp1);
1260      tmp1 = emit_fetch( bld, inst, 0, CHAN_Z );
1261      tmp2 = emit_fetch( bld, inst, 1, CHAN_Z );
1262      tmp1 = lp_build_mul( &bld->base, tmp1, tmp2);
1263      tmp0 = lp_build_add( &bld->base, tmp0, tmp1);
1264      tmp1 = emit_fetch( bld, inst, 0, CHAN_W );
1265      tmp2 = emit_fetch( bld, inst, 1, CHAN_W );
1266      tmp1 = lp_build_mul( &bld->base, tmp1, tmp2);
1267      tmp0 = lp_build_add( &bld->base, tmp0, tmp1);
1268      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1269         dst0[chan_index] = tmp0;
1270      }
1271      break;
1272
1273   case TGSI_OPCODE_DST:
1274      IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ) {
1275         dst0[CHAN_X] = bld->base.one;
1276      }
1277      IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y ) {
1278         tmp0 = emit_fetch( bld, inst, 0, CHAN_Y );
1279         tmp1 = emit_fetch( bld, inst, 1, CHAN_Y );
1280         dst0[CHAN_Y] = lp_build_mul( &bld->base, tmp0, tmp1);
1281      }
1282      IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z ) {
1283         dst0[CHAN_Z] = emit_fetch( bld, inst, 0, CHAN_Z );
1284      }
1285      IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_W ) {
1286         dst0[CHAN_W] = emit_fetch( bld, inst, 1, CHAN_W );
1287      }
1288      break;
1289
1290   case TGSI_OPCODE_MIN:
1291      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1292         src0 = emit_fetch( bld, inst, 0, chan_index );
1293         src1 = emit_fetch( bld, inst, 1, chan_index );
1294         dst0[chan_index] = lp_build_min( &bld->base, src0, src1 );
1295      }
1296      break;
1297
1298   case TGSI_OPCODE_MAX:
1299      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1300         src0 = emit_fetch( bld, inst, 0, chan_index );
1301         src1 = emit_fetch( bld, inst, 1, chan_index );
1302         dst0[chan_index] = lp_build_max( &bld->base, src0, src1 );
1303      }
1304      break;
1305
1306   case TGSI_OPCODE_SLT:
1307   /* TGSI_OPCODE_SETLT */
1308      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1309         src0 = emit_fetch( bld, inst, 0, chan_index );
1310         src1 = emit_fetch( bld, inst, 1, chan_index );
1311         tmp0 = lp_build_cmp( &bld->base, PIPE_FUNC_LESS, src0, src1 );
1312         dst0[chan_index] = lp_build_select( &bld->base, tmp0, bld->base.one, bld->base.zero );
1313      }
1314      break;
1315
1316   case TGSI_OPCODE_SGE:
1317   /* TGSI_OPCODE_SETGE */
1318      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1319         src0 = emit_fetch( bld, inst, 0, chan_index );
1320         src1 = emit_fetch( bld, inst, 1, chan_index );
1321         tmp0 = lp_build_cmp( &bld->base, PIPE_FUNC_GEQUAL, src0, src1 );
1322         dst0[chan_index] = lp_build_select( &bld->base, tmp0, bld->base.one, bld->base.zero );
1323      }
1324      break;
1325
1326   case TGSI_OPCODE_MAD:
1327   /* TGSI_OPCODE_MADD */
1328      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1329         tmp0 = emit_fetch( bld, inst, 0, chan_index );
1330         tmp1 = emit_fetch( bld, inst, 1, chan_index );
1331         tmp2 = emit_fetch( bld, inst, 2, chan_index );
1332         tmp0 = lp_build_mul( &bld->base, tmp0, tmp1);
1333         tmp0 = lp_build_add( &bld->base, tmp0, tmp2);
1334         dst0[chan_index] = tmp0;
1335      }
1336      break;
1337
1338   case TGSI_OPCODE_SUB:
1339      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1340         tmp0 = emit_fetch( bld, inst, 0, chan_index );
1341         tmp1 = emit_fetch( bld, inst, 1, chan_index );
1342         dst0[chan_index] = lp_build_sub( &bld->base, tmp0, tmp1);
1343      }
1344      break;
1345
1346   case TGSI_OPCODE_LRP:
1347      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1348         src0 = emit_fetch( bld, inst, 0, chan_index );
1349         src1 = emit_fetch( bld, inst, 1, chan_index );
1350         src2 = emit_fetch( bld, inst, 2, chan_index );
1351         tmp0 = lp_build_sub( &bld->base, src1, src2 );
1352         tmp0 = lp_build_mul( &bld->base, src0, tmp0 );
1353         dst0[chan_index] = lp_build_add( &bld->base, tmp0, src2 );
1354      }
1355      break;
1356
1357   case TGSI_OPCODE_CND:
1358      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1359         src0 = emit_fetch( bld, inst, 0, chan_index );
1360         src1 = emit_fetch( bld, inst, 1, chan_index );
1361         src2 = emit_fetch( bld, inst, 2, chan_index );
1362         tmp1 = lp_build_const_vec(bld->base.type, 0.5);
1363         tmp0 = lp_build_cmp( &bld->base, PIPE_FUNC_GREATER, src2, tmp1);
1364         dst0[chan_index] = lp_build_select( &bld->base, tmp0, src0, src1 );
1365      }
1366      break;
1367
1368   case TGSI_OPCODE_DP2A:
1369      tmp0 = emit_fetch( bld, inst, 0, CHAN_X );  /* xmm0 = src[0].x */
1370      tmp1 = emit_fetch( bld, inst, 1, CHAN_X );  /* xmm1 = src[1].x */
1371      tmp0 = lp_build_mul( &bld->base, tmp0, tmp1);              /* xmm0 = xmm0 * xmm1 */
1372      tmp1 = emit_fetch( bld, inst, 0, CHAN_Y );  /* xmm1 = src[0].y */
1373      tmp2 = emit_fetch( bld, inst, 1, CHAN_Y );  /* xmm2 = src[1].y */
1374      tmp1 = lp_build_mul( &bld->base, tmp1, tmp2);              /* xmm1 = xmm1 * xmm2 */
1375      tmp0 = lp_build_add( &bld->base, tmp0, tmp1);              /* xmm0 = xmm0 + xmm1 */
1376      tmp1 = emit_fetch( bld, inst, 2, CHAN_X );  /* xmm1 = src[2].x */
1377      tmp0 = lp_build_add( &bld->base, tmp0, tmp1);              /* xmm0 = xmm0 + xmm1 */
1378      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1379         dst0[chan_index] = tmp0;  /* dest[ch] = xmm0 */
1380      }
1381      break;
1382
1383   case TGSI_OPCODE_FRC:
1384      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1385         src0 = emit_fetch( bld, inst, 0, chan_index );
1386         tmp0 = lp_build_floor(&bld->base, src0);
1387         tmp0 = lp_build_sub(&bld->base, src0, tmp0);
1388         dst0[chan_index] = tmp0;
1389      }
1390      break;
1391
1392   case TGSI_OPCODE_CLAMP:
1393      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1394         tmp0 = emit_fetch( bld, inst, 0, chan_index );
1395         src1 = emit_fetch( bld, inst, 1, chan_index );
1396         src2 = emit_fetch( bld, inst, 2, chan_index );
1397         tmp0 = lp_build_max(&bld->base, tmp0, src1);
1398         tmp0 = lp_build_min(&bld->base, tmp0, src2);
1399         dst0[chan_index] = tmp0;
1400      }
1401      break;
1402
1403   case TGSI_OPCODE_FLR:
1404      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1405         tmp0 = emit_fetch( bld, inst, 0, chan_index );
1406         dst0[chan_index] = lp_build_floor(&bld->base, tmp0);
1407      }
1408      break;
1409
1410   case TGSI_OPCODE_ROUND:
1411      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1412         tmp0 = emit_fetch( bld, inst, 0, chan_index );
1413         dst0[chan_index] = lp_build_round(&bld->base, tmp0);
1414      }
1415      break;
1416
1417   case TGSI_OPCODE_EX2: {
1418      tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
1419      tmp0 = lp_build_exp2( &bld->base, tmp0);
1420      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1421         dst0[chan_index] = tmp0;
1422      }
1423      break;
1424   }
1425
1426   case TGSI_OPCODE_LG2:
1427      tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
1428      tmp0 = lp_build_log2( &bld->base, tmp0);
1429      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1430         dst0[chan_index] = tmp0;
1431      }
1432      break;
1433
1434   case TGSI_OPCODE_POW:
1435      src0 = emit_fetch( bld, inst, 0, CHAN_X );
1436      src1 = emit_fetch( bld, inst, 1, CHAN_X );
1437      res = lp_build_pow( &bld->base, src0, src1 );
1438      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1439         dst0[chan_index] = res;
1440      }
1441      break;
1442
1443   case TGSI_OPCODE_XPD:
1444      if( IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ) ||
1445          IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y ) ) {
1446         tmp1 = emit_fetch( bld, inst, 1, CHAN_Z );
1447         tmp3 = emit_fetch( bld, inst, 0, CHAN_Z );
1448      }
1449      if( IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ) ||
1450          IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z ) ) {
1451         tmp0 = emit_fetch( bld, inst, 0, CHAN_Y );
1452         tmp4 = emit_fetch( bld, inst, 1, CHAN_Y );
1453      }
1454      IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ) {
1455         tmp2 = tmp0;
1456         tmp2 = lp_build_mul( &bld->base, tmp2, tmp1);
1457         tmp5 = tmp3;
1458         tmp5 = lp_build_mul( &bld->base, tmp5, tmp4);
1459         tmp2 = lp_build_sub( &bld->base, tmp2, tmp5);
1460         dst0[CHAN_X] = tmp2;
1461      }
1462      if( IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y ) ||
1463          IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z ) ) {
1464         tmp2 = emit_fetch( bld, inst, 1, CHAN_X );
1465         tmp5 = emit_fetch( bld, inst, 0, CHAN_X );
1466      }
1467      IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y ) {
1468         tmp3 = lp_build_mul( &bld->base, tmp3, tmp2);
1469         tmp1 = lp_build_mul( &bld->base, tmp1, tmp5);
1470         tmp3 = lp_build_sub( &bld->base, tmp3, tmp1);
1471         dst0[CHAN_Y] = tmp3;
1472      }
1473      IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z ) {
1474         tmp5 = lp_build_mul( &bld->base, tmp5, tmp4);
1475         tmp0 = lp_build_mul( &bld->base, tmp0, tmp2);
1476         tmp5 = lp_build_sub( &bld->base, tmp5, tmp0);
1477         dst0[CHAN_Z] = tmp5;
1478      }
1479      IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_W ) {
1480         dst0[CHAN_W] = bld->base.one;
1481      }
1482      break;
1483
1484   case TGSI_OPCODE_ABS:
1485      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1486         tmp0 = emit_fetch( bld, inst, 0, chan_index );
1487         dst0[chan_index] = lp_build_abs( &bld->base, tmp0 );
1488      }
1489      break;
1490
1491   case TGSI_OPCODE_RCC:
1492      /* deprecated? */
1493      assert(0);
1494      return FALSE;
1495
1496   case TGSI_OPCODE_DPH:
1497      tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
1498      tmp1 = emit_fetch( bld, inst, 1, CHAN_X );
1499      tmp0 = lp_build_mul( &bld->base, tmp0, tmp1);
1500      tmp1 = emit_fetch( bld, inst, 0, CHAN_Y );
1501      tmp2 = emit_fetch( bld, inst, 1, CHAN_Y );
1502      tmp1 = lp_build_mul( &bld->base, tmp1, tmp2);
1503      tmp0 = lp_build_add( &bld->base, tmp0, tmp1);
1504      tmp1 = emit_fetch( bld, inst, 0, CHAN_Z );
1505      tmp2 = emit_fetch( bld, inst, 1, CHAN_Z );
1506      tmp1 = lp_build_mul( &bld->base, tmp1, tmp2);
1507      tmp0 = lp_build_add( &bld->base, tmp0, tmp1);
1508      tmp1 = emit_fetch( bld, inst, 1, CHAN_W );
1509      tmp0 = lp_build_add( &bld->base, tmp0, tmp1);
1510      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1511         dst0[chan_index] = tmp0;
1512      }
1513      break;
1514
1515   case TGSI_OPCODE_COS:
1516      tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
1517      tmp0 = lp_build_cos( &bld->base, tmp0 );
1518      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1519         dst0[chan_index] = tmp0;
1520      }
1521      break;
1522
1523   case TGSI_OPCODE_DDX:
1524      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1525         emit_fetch_deriv( bld, inst, 0, chan_index, NULL, &dst0[chan_index], NULL);
1526      }
1527      break;
1528
1529   case TGSI_OPCODE_DDY:
1530      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1531         emit_fetch_deriv( bld, inst, 0, chan_index, NULL, NULL, &dst0[chan_index]);
1532      }
1533      break;
1534
1535   case TGSI_OPCODE_KILP:
1536      /* predicated kill */
1537      emit_kilp( bld, inst );
1538      break;
1539
1540   case TGSI_OPCODE_KIL:
1541      /* conditional kill */
1542      emit_kil( bld, inst );
1543      break;
1544
1545   case TGSI_OPCODE_PK2H:
1546      return FALSE;
1547      break;
1548
1549   case TGSI_OPCODE_PK2US:
1550      return FALSE;
1551      break;
1552
1553   case TGSI_OPCODE_PK4B:
1554      return FALSE;
1555      break;
1556
1557   case TGSI_OPCODE_PK4UB:
1558      return FALSE;
1559      break;
1560
1561   case TGSI_OPCODE_RFL:
1562      return FALSE;
1563      break;
1564
1565   case TGSI_OPCODE_SEQ:
1566      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1567         src0 = emit_fetch( bld, inst, 0, chan_index );
1568         src1 = emit_fetch( bld, inst, 1, chan_index );
1569         tmp0 = lp_build_cmp( &bld->base, PIPE_FUNC_EQUAL, src0, src1 );
1570         dst0[chan_index] = lp_build_select( &bld->base, tmp0, bld->base.one, bld->base.zero );
1571      }
1572      break;
1573
1574   case TGSI_OPCODE_SFL:
1575      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1576         dst0[chan_index] = bld->base.zero;
1577      }
1578      break;
1579
1580   case TGSI_OPCODE_SGT:
1581      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1582         src0 = emit_fetch( bld, inst, 0, chan_index );
1583         src1 = emit_fetch( bld, inst, 1, chan_index );
1584         tmp0 = lp_build_cmp( &bld->base, PIPE_FUNC_GREATER, src0, src1 );
1585         dst0[chan_index] = lp_build_select( &bld->base, tmp0, bld->base.one, bld->base.zero );
1586      }
1587      break;
1588
1589   case TGSI_OPCODE_SIN:
1590      tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
1591      tmp0 = lp_build_sin( &bld->base, tmp0 );
1592      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1593         dst0[chan_index] = tmp0;
1594      }
1595      break;
1596
1597   case TGSI_OPCODE_SLE:
1598      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1599         src0 = emit_fetch( bld, inst, 0, chan_index );
1600         src1 = emit_fetch( bld, inst, 1, chan_index );
1601         tmp0 = lp_build_cmp( &bld->base, PIPE_FUNC_LEQUAL, src0, src1 );
1602         dst0[chan_index] = lp_build_select( &bld->base, tmp0, bld->base.one, bld->base.zero );
1603      }
1604      break;
1605
1606   case TGSI_OPCODE_SNE:
1607      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1608         src0 = emit_fetch( bld, inst, 0, chan_index );
1609         src1 = emit_fetch( bld, inst, 1, chan_index );
1610         tmp0 = lp_build_cmp( &bld->base, PIPE_FUNC_NOTEQUAL, src0, src1 );
1611         dst0[chan_index] = lp_build_select( &bld->base, tmp0, bld->base.one, bld->base.zero );
1612      }
1613      break;
1614
1615   case TGSI_OPCODE_STR:
1616      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1617         dst0[chan_index] = bld->base.one;
1618      }
1619      break;
1620
1621   case TGSI_OPCODE_TEX:
1622      emit_tex( bld, inst, LP_BLD_TEX_MODIFIER_NONE, dst0 );
1623      break;
1624
1625   case TGSI_OPCODE_TXD:
1626      emit_tex( bld, inst, LP_BLD_TEX_MODIFIER_EXPLICIT_DERIV, dst0 );
1627      break;
1628
1629   case TGSI_OPCODE_UP2H:
1630      /* deprecated */
1631      assert (0);
1632      return FALSE;
1633      break;
1634
1635   case TGSI_OPCODE_UP2US:
1636      /* deprecated */
1637      assert(0);
1638      return FALSE;
1639      break;
1640
1641   case TGSI_OPCODE_UP4B:
1642      /* deprecated */
1643      assert(0);
1644      return FALSE;
1645      break;
1646
1647   case TGSI_OPCODE_UP4UB:
1648      /* deprecated */
1649      assert(0);
1650      return FALSE;
1651      break;
1652
1653   case TGSI_OPCODE_X2D:
1654      /* deprecated? */
1655      assert(0);
1656      return FALSE;
1657      break;
1658
1659   case TGSI_OPCODE_ARA:
1660      /* deprecated */
1661      assert(0);
1662      return FALSE;
1663      break;
1664
1665   case TGSI_OPCODE_ARR:
1666      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1667         tmp0 = emit_fetch( bld, inst, 0, chan_index );
1668         tmp0 = lp_build_round(&bld->base, tmp0);
1669         dst0[chan_index] = tmp0;
1670      }
1671      break;
1672
1673   case TGSI_OPCODE_BRA:
1674      /* deprecated */
1675      assert(0);
1676      return FALSE;
1677      break;
1678
1679   case TGSI_OPCODE_CAL:
1680      lp_exec_mask_call(&bld->exec_mask,
1681                        inst->Label.Label,
1682                        pc);
1683
1684      break;
1685
1686   case TGSI_OPCODE_RET:
1687      lp_exec_mask_ret(&bld->exec_mask, pc);
1688      break;
1689
1690   case TGSI_OPCODE_END:
1691      *pc = -1;
1692      break;
1693
1694   case TGSI_OPCODE_SSG:
1695   /* TGSI_OPCODE_SGN */
1696      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1697         tmp0 = emit_fetch( bld, inst, 0, chan_index );
1698         dst0[chan_index] = lp_build_sgn( &bld->base, tmp0 );
1699      }
1700      break;
1701
1702   case TGSI_OPCODE_CMP:
1703      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1704         src0 = emit_fetch( bld, inst, 0, chan_index );
1705         src1 = emit_fetch( bld, inst, 1, chan_index );
1706         src2 = emit_fetch( bld, inst, 2, chan_index );
1707         tmp0 = lp_build_cmp( &bld->base, PIPE_FUNC_LESS, src0, bld->base.zero );
1708         dst0[chan_index] = lp_build_select( &bld->base, tmp0, src1, src2);
1709      }
1710      break;
1711
1712   case TGSI_OPCODE_SCS:
1713      IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ) {
1714         tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
1715         dst0[CHAN_X] = lp_build_cos( &bld->base, tmp0 );
1716      }
1717      IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y ) {
1718         tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
1719         dst0[CHAN_Y] = lp_build_sin( &bld->base, tmp0 );
1720      }
1721      IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z ) {
1722         dst0[CHAN_Z] = bld->base.zero;
1723      }
1724      IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_W ) {
1725         dst0[CHAN_W] = bld->base.one;
1726      }
1727      break;
1728
1729   case TGSI_OPCODE_TXB:
1730      emit_tex( bld, inst, LP_BLD_TEX_MODIFIER_LOD_BIAS, dst0 );
1731      break;
1732
1733   case TGSI_OPCODE_NRM:
1734      /* fall-through */
1735   case TGSI_OPCODE_NRM4:
1736      /* 3 or 4-component normalization */
1737      {
1738         uint dims = (inst->Instruction.Opcode == TGSI_OPCODE_NRM) ? 3 : 4;
1739
1740         if (IS_DST0_CHANNEL_ENABLED(inst, CHAN_X) ||
1741             IS_DST0_CHANNEL_ENABLED(inst, CHAN_Y) ||
1742             IS_DST0_CHANNEL_ENABLED(inst, CHAN_Z) ||
1743             (IS_DST0_CHANNEL_ENABLED(inst, CHAN_W) && dims == 4)) {
1744
1745            /* NOTE: Cannot use xmm regs 2/3 here (see emit_rsqrt() above). */
1746
1747            /* xmm4 = src.x */
1748            /* xmm0 = src.x * src.x */
1749            tmp0 = emit_fetch(bld, inst, 0, CHAN_X);
1750            if (IS_DST0_CHANNEL_ENABLED(inst, CHAN_X)) {
1751               tmp4 = tmp0;
1752            }
1753            tmp0 = lp_build_mul( &bld->base, tmp0, tmp0);
1754
1755            /* xmm5 = src.y */
1756            /* xmm0 = xmm0 + src.y * src.y */
1757            tmp1 = emit_fetch(bld, inst, 0, CHAN_Y);
1758            if (IS_DST0_CHANNEL_ENABLED(inst, CHAN_Y)) {
1759               tmp5 = tmp1;
1760            }
1761            tmp1 = lp_build_mul( &bld->base, tmp1, tmp1);
1762            tmp0 = lp_build_add( &bld->base, tmp0, tmp1);
1763
1764            /* xmm6 = src.z */
1765            /* xmm0 = xmm0 + src.z * src.z */
1766            tmp1 = emit_fetch(bld, inst, 0, CHAN_Z);
1767            if (IS_DST0_CHANNEL_ENABLED(inst, CHAN_Z)) {
1768               tmp6 = tmp1;
1769            }
1770            tmp1 = lp_build_mul( &bld->base, tmp1, tmp1);
1771            tmp0 = lp_build_add( &bld->base, tmp0, tmp1);
1772
1773            if (dims == 4) {
1774               /* xmm7 = src.w */
1775               /* xmm0 = xmm0 + src.w * src.w */
1776               tmp1 = emit_fetch(bld, inst, 0, CHAN_W);
1777               if (IS_DST0_CHANNEL_ENABLED(inst, CHAN_W)) {
1778                  tmp7 = tmp1;
1779               }
1780               tmp1 = lp_build_mul( &bld->base, tmp1, tmp1);
1781               tmp0 = lp_build_add( &bld->base, tmp0, tmp1);
1782            }
1783
1784            /* xmm1 = 1 / sqrt(xmm0) */
1785            tmp1 = lp_build_rsqrt( &bld->base, tmp0);
1786
1787            /* dst.x = xmm1 * src.x */
1788            if (IS_DST0_CHANNEL_ENABLED(inst, CHAN_X)) {
1789               dst0[CHAN_X] = lp_build_mul( &bld->base, tmp4, tmp1);
1790            }
1791
1792            /* dst.y = xmm1 * src.y */
1793            if (IS_DST0_CHANNEL_ENABLED(inst, CHAN_Y)) {
1794               dst0[CHAN_Y] = lp_build_mul( &bld->base, tmp5, tmp1);
1795            }
1796
1797            /* dst.z = xmm1 * src.z */
1798            if (IS_DST0_CHANNEL_ENABLED(inst, CHAN_Z)) {
1799               dst0[CHAN_Z] = lp_build_mul( &bld->base, tmp6, tmp1);
1800            }
1801
1802            /* dst.w = xmm1 * src.w */
1803            if (IS_DST0_CHANNEL_ENABLED(inst, CHAN_X) && dims == 4) {
1804               dst0[CHAN_W] = lp_build_mul( &bld->base, tmp7, tmp1);
1805            }
1806         }
1807
1808         /* dst.w = 1.0 */
1809         if (IS_DST0_CHANNEL_ENABLED(inst, CHAN_W) && dims == 3) {
1810            dst0[CHAN_W] = bld->base.one;
1811         }
1812      }
1813      break;
1814
1815   case TGSI_OPCODE_DIV:
1816      /* deprecated */
1817      assert( 0 );
1818      return FALSE;
1819      break;
1820
1821   case TGSI_OPCODE_DP2:
1822      tmp0 = emit_fetch( bld, inst, 0, CHAN_X );  /* xmm0 = src[0].x */
1823      tmp1 = emit_fetch( bld, inst, 1, CHAN_X );  /* xmm1 = src[1].x */
1824      tmp0 = lp_build_mul( &bld->base, tmp0, tmp1);              /* xmm0 = xmm0 * xmm1 */
1825      tmp1 = emit_fetch( bld, inst, 0, CHAN_Y );  /* xmm1 = src[0].y */
1826      tmp2 = emit_fetch( bld, inst, 1, CHAN_Y );  /* xmm2 = src[1].y */
1827      tmp1 = lp_build_mul( &bld->base, tmp1, tmp2);              /* xmm1 = xmm1 * xmm2 */
1828      tmp0 = lp_build_add( &bld->base, tmp0, tmp1);              /* xmm0 = xmm0 + xmm1 */
1829      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1830         dst0[chan_index] = tmp0;  /* dest[ch] = xmm0 */
1831      }
1832      break;
1833
1834   case TGSI_OPCODE_TXL:
1835      emit_tex( bld, inst, LP_BLD_TEX_MODIFIER_EXPLICIT_LOD, dst0 );
1836      break;
1837
1838   case TGSI_OPCODE_TXP:
1839      emit_tex( bld, inst, LP_BLD_TEX_MODIFIER_PROJECTED, dst0 );
1840      break;
1841
1842   case TGSI_OPCODE_BRK:
1843      lp_exec_break(&bld->exec_mask);
1844      break;
1845
1846   case TGSI_OPCODE_IF:
1847      tmp0 = emit_fetch(bld, inst, 0, CHAN_X);
1848      tmp0 = lp_build_cmp(&bld->base, PIPE_FUNC_NOTEQUAL,
1849                          tmp0, bld->base.zero);
1850      lp_exec_mask_cond_push(&bld->exec_mask, tmp0);
1851      break;
1852
1853   case TGSI_OPCODE_BGNLOOP:
1854      lp_exec_bgnloop(&bld->exec_mask);
1855      break;
1856
1857   case TGSI_OPCODE_BGNSUB:
1858      lp_exec_mask_bgnsub(&bld->exec_mask);
1859      break;
1860
1861   case TGSI_OPCODE_ELSE:
1862      lp_exec_mask_cond_invert(&bld->exec_mask);
1863      break;
1864
1865   case TGSI_OPCODE_ENDIF:
1866      lp_exec_mask_cond_pop(&bld->exec_mask);
1867      break;
1868
1869   case TGSI_OPCODE_ENDLOOP:
1870      lp_exec_endloop(&bld->exec_mask);
1871      break;
1872
1873   case TGSI_OPCODE_ENDSUB:
1874      lp_exec_mask_endsub(&bld->exec_mask, pc);
1875      break;
1876
1877   case TGSI_OPCODE_PUSHA:
1878      /* deprecated? */
1879      assert(0);
1880      return FALSE;
1881      break;
1882
1883   case TGSI_OPCODE_POPA:
1884      /* deprecated? */
1885      assert(0);
1886      return FALSE;
1887      break;
1888
1889   case TGSI_OPCODE_CEIL:
1890      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1891         tmp0 = emit_fetch( bld, inst, 0, chan_index );
1892         dst0[chan_index] = lp_build_ceil(&bld->base, tmp0);
1893      }
1894      break;
1895
1896   case TGSI_OPCODE_I2F:
1897      /* deprecated? */
1898      assert(0);
1899      return FALSE;
1900      break;
1901
1902   case TGSI_OPCODE_NOT:
1903      /* deprecated? */
1904      assert(0);
1905      return FALSE;
1906      break;
1907
1908   case TGSI_OPCODE_TRUNC:
1909      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1910         tmp0 = emit_fetch( bld, inst, 0, chan_index );
1911         dst0[chan_index] = lp_build_trunc(&bld->base, tmp0);
1912      }
1913      break;
1914
1915   case TGSI_OPCODE_SHL:
1916      /* deprecated? */
1917      assert(0);
1918      return FALSE;
1919      break;
1920
1921   case TGSI_OPCODE_ISHR:
1922      /* deprecated? */
1923      assert(0);
1924      return FALSE;
1925      break;
1926
1927   case TGSI_OPCODE_AND:
1928      /* deprecated? */
1929      assert(0);
1930      return FALSE;
1931      break;
1932
1933   case TGSI_OPCODE_OR:
1934      /* deprecated? */
1935      assert(0);
1936      return FALSE;
1937      break;
1938
1939   case TGSI_OPCODE_MOD:
1940      /* deprecated? */
1941      assert(0);
1942      return FALSE;
1943      break;
1944
1945   case TGSI_OPCODE_XOR:
1946      /* deprecated? */
1947      assert(0);
1948      return FALSE;
1949      break;
1950
1951   case TGSI_OPCODE_SAD:
1952      /* deprecated? */
1953      assert(0);
1954      return FALSE;
1955      break;
1956
1957   case TGSI_OPCODE_TXF:
1958      /* deprecated? */
1959      assert(0);
1960      return FALSE;
1961      break;
1962
1963   case TGSI_OPCODE_TXQ:
1964      /* deprecated? */
1965      assert(0);
1966      return FALSE;
1967      break;
1968
1969   case TGSI_OPCODE_CONT:
1970      lp_exec_continue(&bld->exec_mask);
1971      break;
1972
1973   case TGSI_OPCODE_EMIT:
1974      return FALSE;
1975      break;
1976
1977   case TGSI_OPCODE_ENDPRIM:
1978      return FALSE;
1979      break;
1980
1981   case TGSI_OPCODE_NOP:
1982      break;
1983
1984   default:
1985      return FALSE;
1986   }
1987
1988   if(info->num_dst) {
1989      LLVMValueRef pred[NUM_CHANNELS];
1990
1991      emit_fetch_predicate( bld, inst, pred );
1992
1993      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1994         emit_store( bld, inst, 0, chan_index, pred[chan_index], dst0[chan_index]);
1995      }
1996   }
1997
1998   return TRUE;
1999}
2000
2001
2002void
2003lp_build_tgsi_soa(LLVMBuilderRef builder,
2004                  const struct tgsi_token *tokens,
2005                  struct lp_type type,
2006                  struct lp_build_mask_context *mask,
2007                  LLVMValueRef consts_ptr,
2008                  const LLVMValueRef *pos,
2009                  const LLVMValueRef (*inputs)[NUM_CHANNELS],
2010                  LLVMValueRef (*outputs)[NUM_CHANNELS],
2011                  struct lp_build_sampler_soa *sampler,
2012                  const struct tgsi_shader_info *info)
2013{
2014   struct lp_build_tgsi_soa_context bld;
2015   struct tgsi_parse_context parse;
2016   uint num_immediates = 0;
2017   uint num_instructions = 0;
2018   unsigned i;
2019   int pc = 0;
2020
2021   /* Setup build context */
2022   memset(&bld, 0, sizeof bld);
2023   lp_build_context_init(&bld.base, builder, type);
2024   lp_build_context_init(&bld.int_bld, builder, lp_int_type(type));
2025   bld.mask = mask;
2026   bld.pos = pos;
2027   bld.inputs = inputs;
2028   bld.outputs = outputs;
2029   bld.consts_ptr = consts_ptr;
2030   bld.sampler = sampler;
2031   bld.indirect_files = info->indirect_files;
2032   bld.instructions = (struct tgsi_full_instruction *)
2033                      MALLOC( LP_MAX_INSTRUCTIONS * sizeof(struct tgsi_full_instruction) );
2034   bld.max_instructions = LP_MAX_INSTRUCTIONS;
2035
2036   if (!bld.instructions) {
2037      return;
2038   }
2039
2040   lp_exec_mask_init(&bld.exec_mask, &bld.base);
2041
2042   tgsi_parse_init( &parse, tokens );
2043
2044   while( !tgsi_parse_end_of_tokens( &parse ) ) {
2045      tgsi_parse_token( &parse );
2046
2047      switch( parse.FullToken.Token.Type ) {
2048      case TGSI_TOKEN_TYPE_DECLARATION:
2049         /* Inputs already interpolated */
2050         emit_declaration( &bld, &parse.FullToken.FullDeclaration );
2051         break;
2052
2053      case TGSI_TOKEN_TYPE_INSTRUCTION:
2054         {
2055            /* save expanded instruction */
2056            if (num_instructions == bld.max_instructions) {
2057               struct tgsi_full_instruction *instructions;
2058               instructions = REALLOC(bld.instructions,
2059                                      bld.max_instructions
2060                                      * sizeof(struct tgsi_full_instruction),
2061                                      (bld.max_instructions + LP_MAX_INSTRUCTIONS)
2062                                      * sizeof(struct tgsi_full_instruction));
2063               if (!instructions) {
2064                  break;
2065               }
2066               bld.instructions = instructions;
2067               bld.max_instructions += LP_MAX_INSTRUCTIONS;
2068            }
2069
2070            memcpy(bld.instructions + num_instructions,
2071                   &parse.FullToken.FullInstruction,
2072                   sizeof(bld.instructions[0]));
2073
2074            num_instructions++;
2075         }
2076
2077         break;
2078
2079      case TGSI_TOKEN_TYPE_IMMEDIATE:
2080         /* simply copy the immediate values into the next immediates[] slot */
2081         {
2082            const uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
2083            assert(size <= 4);
2084            assert(num_immediates < LP_MAX_TGSI_IMMEDIATES);
2085            for( i = 0; i < size; ++i )
2086               bld.immediates[num_immediates][i] =
2087                  lp_build_const_vec(type, parse.FullToken.FullImmediate.u[i].Float);
2088            for( i = size; i < 4; ++i )
2089               bld.immediates[num_immediates][i] = bld.base.undef;
2090            num_immediates++;
2091         }
2092         break;
2093
2094      case TGSI_TOKEN_TYPE_PROPERTY:
2095         break;
2096
2097      default:
2098         assert( 0 );
2099      }
2100   }
2101
2102   while (pc != -1) {
2103      struct tgsi_full_instruction *instr = bld.instructions + pc;
2104      const struct tgsi_opcode_info *opcode_info =
2105         tgsi_get_opcode_info(instr->Instruction.Opcode);
2106      if (!emit_instruction( &bld, instr, opcode_info, &pc ))
2107         _debug_printf("warning: failed to translate tgsi opcode %s to LLVM\n",
2108                       opcode_info->mnemonic);
2109   }
2110
2111   if (0) {
2112      LLVMBasicBlockRef block = LLVMGetInsertBlock(builder);
2113      LLVMValueRef function = LLVMGetBasicBlockParent(block);
2114      debug_printf("11111111111111111111111111111 \n");
2115      tgsi_dump(tokens, 0);
2116      lp_debug_dump_value(function);
2117      debug_printf("2222222222222222222222222222 \n");
2118   }
2119   tgsi_parse_free( &parse );
2120
2121   if (0) {
2122      LLVMModuleRef module = LLVMGetGlobalParent(
2123         LLVMGetBasicBlockParent(LLVMGetInsertBlock(bld.base.builder)));
2124      LLVMDumpModule(module);
2125
2126   }
2127
2128   FREE( bld.instructions );
2129}
2130
2131