lp_bld_tgsi_soa.c revision 17dbd41cf23e7e7de2f27e5e9252d7f792d932f3
1/**************************************************************************
2 *
3 * Copyright 2009 VMware, Inc.
4 * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
5 * All Rights Reserved.
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the
9 * "Software"), to deal in the Software without restriction, including
10 * without limitation the rights to use, copy, modify, merge, publish,
11 * distribute, sub license, and/or sell copies of the Software, and to
12 * permit persons to whom the Software is furnished to do so, subject to
13 * the following conditions:
14 *
15 * The above copyright notice and this permission notice (including the
16 * next paragraph) shall be included in all copies or substantial portions
17 * of the Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
22 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
23 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
24 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
25 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 *
27 **************************************************************************/
28
29/**
30 * @file
31 * TGSI to LLVM IR translation -- SoA.
32 *
33 * @author Jose Fonseca <jfonseca@vmware.com>
34 *
35 * Based on tgsi_sse2.c code written by Michal Krol, Keith Whitwell,
36 * Brian Paul, and others.
37 */
38
39#include "pipe/p_config.h"
40#include "pipe/p_shader_tokens.h"
41#include "util/u_debug.h"
42#include "util/u_math.h"
43#include "util/u_memory.h"
44#include "tgsi/tgsi_dump.h"
45#include "tgsi/tgsi_info.h"
46#include "tgsi/tgsi_parse.h"
47#include "tgsi/tgsi_util.h"
48#include "tgsi/tgsi_scan.h"
49#include "lp_bld_type.h"
50#include "lp_bld_const.h"
51#include "lp_bld_arit.h"
52#include "lp_bld_bitarit.h"
53#include "lp_bld_gather.h"
54#include "lp_bld_logic.h"
55#include "lp_bld_swizzle.h"
56#include "lp_bld_flow.h"
57#include "lp_bld_quad.h"
58#include "lp_bld_tgsi.h"
59#include "lp_bld_limits.h"
60#include "lp_bld_debug.h"
61
62
/* Channel indices of a four-component TGSI register. */
#define CHAN_X 0
#define CHAN_Y 1
#define CHAN_Z 2
#define CHAN_W 3
#define NUM_CHANNELS 4

/**
 * Loop header stepping CHAN through 0 .. NUM_CHANNELS-1.
 * CHAN must be an assignable integer lvalue declared by the caller.
 */
#define FOR_EACH_CHANNEL( CHAN )\
   for ((CHAN) = 0; (CHAN) < NUM_CHANNELS; (CHAN)++)

/**
 * Non-zero when bit CHAN is set in the write mask of the first destination
 * register of instruction INST.
 */
#define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
   ((INST)->Dst[0].Register.WriteMask & (1 << (CHAN)))

/**
 * `if` header guarding the following statement on the write mask test above.
 * Because it expands to a bare `if`, an `else` written after the guarded
 * statement binds to this hidden `if`.
 */
#define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
   if (IS_DST0_CHANNEL_ENABLED( INST, CHAN ))

/**
 * Loop over only those channels that are enabled in the write mask of the
 * first destination register of INST.
 */
#define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\
   FOR_EACH_CHANNEL( CHAN )\
      IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )

/* NOTE(review): presumably the sizing unit of the instructions array held in
 * lp_build_tgsi_soa_context -- confirm against its users. */
#define LP_MAX_INSTRUCTIONS 256
83
84
85struct lp_exec_mask {
86   struct lp_build_context *bld;
87
88   boolean has_mask;
89
90   LLVMTypeRef int_vec_type;
91
92   LLVMValueRef cond_stack[LP_MAX_TGSI_NESTING];
93   int cond_stack_size;
94   LLVMValueRef cond_mask;
95
96   LLVMBasicBlockRef loop_block;
97   LLVMValueRef cont_mask;
98   LLVMValueRef break_mask;
99   LLVMValueRef break_var;
100   struct {
101      LLVMBasicBlockRef loop_block;
102      LLVMValueRef cont_mask;
103      LLVMValueRef break_mask;
104      LLVMValueRef break_var;
105   } loop_stack[LP_MAX_TGSI_NESTING];
106   int loop_stack_size;
107
108   LLVMValueRef ret_mask;
109   struct {
110      int pc;
111      LLVMValueRef ret_mask;
112   } call_stack[LP_MAX_TGSI_NESTING];
113   int call_stack_size;
114
115   LLVMValueRef exec_mask;
116};
117
118struct lp_build_tgsi_soa_context
119{
120   struct lp_build_context base;
121
122   /* Builder for integer masks and indices */
123   struct lp_build_context uint_bld;
124
125   LLVMValueRef consts_ptr;
126   const LLVMValueRef *pos;
127   const LLVMValueRef (*inputs)[NUM_CHANNELS];
128   LLVMValueRef (*outputs)[NUM_CHANNELS];
129
130   const struct lp_build_sampler_soa *sampler;
131
132   LLVMValueRef immediates[LP_MAX_TGSI_IMMEDIATES][NUM_CHANNELS];
133   LLVMValueRef temps[LP_MAX_TGSI_TEMPS][NUM_CHANNELS];
134   LLVMValueRef addr[LP_MAX_TGSI_ADDRS][NUM_CHANNELS];
135   LLVMValueRef preds[LP_MAX_TGSI_PREDS][NUM_CHANNELS];
136
137   /* We allocate/use this array of temps if (1 << TGSI_FILE_TEMPORARY) is
138    * set in the indirect_files field.
139    * The temps[] array above is unused then.
140    */
141   LLVMValueRef temps_array;
142
143   const struct tgsi_shader_info *info;
144   /** bitmask indicating which register files are accessed indirectly */
145   unsigned indirect_files;
146
147   struct lp_build_mask_context *mask;
148   struct lp_exec_mask exec_mask;
149
150   struct tgsi_full_instruction *instructions;
151   uint max_instructions;
152};
153
154static void lp_exec_mask_init(struct lp_exec_mask *mask, struct lp_build_context *bld)
155{
156   mask->bld = bld;
157   mask->has_mask = FALSE;
158   mask->cond_stack_size = 0;
159   mask->loop_stack_size = 0;
160   mask->call_stack_size = 0;
161
162   mask->int_vec_type = lp_build_int_vec_type(mask->bld->type);
163   mask->exec_mask = mask->ret_mask = mask->break_mask = mask->cont_mask = mask->cond_mask =
164         LLVMConstAllOnes(mask->int_vec_type);
165}
166
167static void lp_exec_mask_update(struct lp_exec_mask *mask)
168{
169   if (mask->loop_stack_size) {
170      /*for loops we need to update the entire mask at runtime */
171      LLVMValueRef tmp;
172      assert(mask->break_mask);
173      tmp = LLVMBuildAnd(mask->bld->builder,
174                         mask->cont_mask,
175                         mask->break_mask,
176                         "maskcb");
177      mask->exec_mask = LLVMBuildAnd(mask->bld->builder,
178                                     mask->cond_mask,
179                                     tmp,
180                                     "maskfull");
181   } else
182      mask->exec_mask = mask->cond_mask;
183
184   if (mask->call_stack_size) {
185      mask->exec_mask = LLVMBuildAnd(mask->bld->builder,
186                                     mask->exec_mask,
187                                     mask->ret_mask,
188                                     "callmask");
189   }
190
191   mask->has_mask = (mask->cond_stack_size > 0 ||
192                     mask->loop_stack_size > 0 ||
193                     mask->call_stack_size > 0);
194}
195
196static void lp_exec_mask_cond_push(struct lp_exec_mask *mask,
197                                   LLVMValueRef val)
198{
199   assert(mask->cond_stack_size < LP_MAX_TGSI_NESTING);
200   if (mask->cond_stack_size == 0) {
201      assert(mask->cond_mask == LLVMConstAllOnes(mask->int_vec_type));
202   }
203   mask->cond_stack[mask->cond_stack_size++] = mask->cond_mask;
204   assert(LLVMTypeOf(val) == mask->int_vec_type);
205   mask->cond_mask = LLVMBuildAnd(mask->bld->builder,
206                                  mask->cond_mask,
207                                  val,
208                                  "");
209   lp_exec_mask_update(mask);
210}
211
212static void lp_exec_mask_cond_invert(struct lp_exec_mask *mask)
213{
214   LLVMValueRef prev_mask;
215   LLVMValueRef inv_mask;
216
217   assert(mask->cond_stack_size);
218   prev_mask = mask->cond_stack[mask->cond_stack_size - 1];
219   if (mask->cond_stack_size == 1) {
220      assert(prev_mask == LLVMConstAllOnes(mask->int_vec_type));
221   }
222
223   inv_mask = LLVMBuildNot(mask->bld->builder, mask->cond_mask, "");
224
225   mask->cond_mask = LLVMBuildAnd(mask->bld->builder,
226                                  inv_mask,
227                                  prev_mask, "");
228   lp_exec_mask_update(mask);
229}
230
231static void lp_exec_mask_cond_pop(struct lp_exec_mask *mask)
232{
233   assert(mask->cond_stack_size);
234   mask->cond_mask = mask->cond_stack[--mask->cond_stack_size];
235   lp_exec_mask_update(mask);
236}
237
238static void lp_exec_bgnloop(struct lp_exec_mask *mask)
239{
240   if (mask->loop_stack_size == 0) {
241      assert(mask->loop_block == NULL);
242      assert(mask->cont_mask == LLVMConstAllOnes(mask->int_vec_type));
243      assert(mask->break_mask == LLVMConstAllOnes(mask->int_vec_type));
244      assert(mask->break_var == NULL);
245   }
246
247   assert(mask->loop_stack_size < LP_MAX_TGSI_NESTING);
248
249   mask->loop_stack[mask->loop_stack_size].loop_block = mask->loop_block;
250   mask->loop_stack[mask->loop_stack_size].cont_mask = mask->cont_mask;
251   mask->loop_stack[mask->loop_stack_size].break_mask = mask->break_mask;
252   mask->loop_stack[mask->loop_stack_size].break_var = mask->break_var;
253   ++mask->loop_stack_size;
254
255   mask->break_var = lp_build_alloca(mask->bld->builder, mask->int_vec_type, "");
256   LLVMBuildStore(mask->bld->builder, mask->break_mask, mask->break_var);
257
258   mask->loop_block = lp_build_insert_new_block(mask->bld->builder, "bgnloop");
259   LLVMBuildBr(mask->bld->builder, mask->loop_block);
260   LLVMPositionBuilderAtEnd(mask->bld->builder, mask->loop_block);
261
262   mask->break_mask = LLVMBuildLoad(mask->bld->builder, mask->break_var, "");
263
264   lp_exec_mask_update(mask);
265}
266
267static void lp_exec_break(struct lp_exec_mask *mask)
268{
269   LLVMValueRef exec_mask = LLVMBuildNot(mask->bld->builder,
270                                         mask->exec_mask,
271                                         "break");
272
273   mask->break_mask = LLVMBuildAnd(mask->bld->builder,
274                                   mask->break_mask,
275                                   exec_mask, "break_full");
276
277   lp_exec_mask_update(mask);
278}
279
280static void lp_exec_continue(struct lp_exec_mask *mask)
281{
282   LLVMValueRef exec_mask = LLVMBuildNot(mask->bld->builder,
283                                         mask->exec_mask,
284                                         "");
285
286   mask->cont_mask = LLVMBuildAnd(mask->bld->builder,
287                                  mask->cont_mask,
288                                  exec_mask, "");
289
290   lp_exec_mask_update(mask);
291}
292
293
294static void lp_exec_endloop(struct lp_exec_mask *mask)
295{
296   LLVMBasicBlockRef endloop;
297   LLVMTypeRef reg_type = LLVMIntType(mask->bld->type.width*
298                                      mask->bld->type.length);
299   LLVMValueRef i1cond;
300
301   assert(mask->break_mask);
302
303   /*
304    * Restore the cont_mask, but don't pop
305    */
306   assert(mask->loop_stack_size);
307   mask->cont_mask = mask->loop_stack[mask->loop_stack_size - 1].cont_mask;
308   lp_exec_mask_update(mask);
309
310   /*
311    * Unlike the continue mask, the break_mask must be preserved across loop
312    * iterations
313    */
314   LLVMBuildStore(mask->bld->builder, mask->break_mask, mask->break_var);
315
316   /* i1cond = (mask == 0) */
317   i1cond = LLVMBuildICmp(
318      mask->bld->builder,
319      LLVMIntNE,
320      LLVMBuildBitCast(mask->bld->builder, mask->exec_mask, reg_type, ""),
321      LLVMConstNull(reg_type), "");
322
323   endloop = lp_build_insert_new_block(mask->bld->builder, "endloop");
324
325   LLVMBuildCondBr(mask->bld->builder,
326                   i1cond, mask->loop_block, endloop);
327
328   LLVMPositionBuilderAtEnd(mask->bld->builder, endloop);
329
330   assert(mask->loop_stack_size);
331   --mask->loop_stack_size;
332   mask->loop_block = mask->loop_stack[mask->loop_stack_size].loop_block;
333   mask->cont_mask = mask->loop_stack[mask->loop_stack_size].cont_mask;
334   mask->break_mask = mask->loop_stack[mask->loop_stack_size].break_mask;
335   mask->break_var = mask->loop_stack[mask->loop_stack_size].break_var;
336
337   lp_exec_mask_update(mask);
338}
339
340/* stores val into an address pointed to by dst.
341 * mask->exec_mask is used to figure out which bits of val
342 * should be stored into the address
343 * (0 means don't store this bit, 1 means do store).
344 */
345static void lp_exec_mask_store(struct lp_exec_mask *mask,
346                               LLVMValueRef pred,
347                               LLVMValueRef val,
348                               LLVMValueRef dst)
349{
350   /* Mix the predicate and execution mask */
351   if (mask->has_mask) {
352      if (pred) {
353         pred = LLVMBuildAnd(mask->bld->builder, pred, mask->exec_mask, "");
354      } else {
355         pred = mask->exec_mask;
356      }
357   }
358
359   if (pred) {
360      LLVMValueRef real_val, dst_val;
361
362      dst_val = LLVMBuildLoad(mask->bld->builder, dst, "");
363      real_val = lp_build_select(mask->bld,
364                                 pred,
365                                 val, dst_val);
366
367      LLVMBuildStore(mask->bld->builder, real_val, dst);
368   } else
369      LLVMBuildStore(mask->bld->builder, val, dst);
370}
371
372static void lp_exec_mask_call(struct lp_exec_mask *mask,
373                              int func,
374                              int *pc)
375{
376   assert(mask->call_stack_size < LP_MAX_TGSI_NESTING);
377   mask->call_stack[mask->call_stack_size].pc = *pc;
378   mask->call_stack[mask->call_stack_size].ret_mask = mask->ret_mask;
379   mask->call_stack_size++;
380   *pc = func;
381}
382
383static void lp_exec_mask_ret(struct lp_exec_mask *mask, int *pc)
384{
385   LLVMValueRef exec_mask;
386
387   if (mask->call_stack_size == 0) {
388      /* returning from main() */
389      *pc = -1;
390      return;
391   }
392   exec_mask = LLVMBuildNot(mask->bld->builder,
393                            mask->exec_mask,
394                            "ret");
395
396   mask->ret_mask = LLVMBuildAnd(mask->bld->builder,
397                                 mask->ret_mask,
398                                 exec_mask, "ret_full");
399
400   lp_exec_mask_update(mask);
401}
402
/**
 * Beginning of a subroutine body.  Nothing needs to happen here: the
 * call stack is pushed by lp_exec_mask_call() and popped by
 * lp_exec_mask_endsub().
 */
static void lp_exec_mask_bgnsub(struct lp_exec_mask *mask)
{
   (void) mask;
}
406
407static void lp_exec_mask_endsub(struct lp_exec_mask *mask, int *pc)
408{
409   assert(mask->call_stack_size);
410   mask->call_stack_size--;
411   *pc = mask->call_stack[mask->call_stack_size].pc;
412   mask->ret_mask = mask->call_stack[mask->call_stack_size].ret_mask;
413   lp_exec_mask_update(mask);
414}
415
416
417/**
418 * Return pointer to a temporary register channel (src or dest).
419 * Note that indirect addressing cannot be handled here.
420 * \param index  which temporary register
421 * \param chan  which channel of the temp register.
422 */
423static LLVMValueRef
424get_temp_ptr(struct lp_build_tgsi_soa_context *bld,
425             unsigned index,
426             unsigned chan)
427{
428   assert(chan < 4);
429   if (bld->indirect_files & (1 << TGSI_FILE_TEMPORARY)) {
430      LLVMValueRef lindex = lp_build_const_int32(index * 4 + chan);
431      return LLVMBuildGEP(bld->base.builder, bld->temps_array, &lindex, 1, "");
432   }
433   else {
434      return bld->temps[index][chan];
435   }
436}
437
438
439/**
440 * Gather vector.
441 * XXX the lp_build_gather() function should be capable of doing this
442 * with a little work.
443 */
444static LLVMValueRef
445build_gather(struct lp_build_tgsi_soa_context *bld,
446             LLVMValueRef base_ptr,
447             LLVMValueRef indexes)
448{
449   LLVMValueRef res = bld->base.undef;
450   unsigned i;
451
452   /*
453    * Loop over elements of index_vec, load scalar value, insert it into 'res'.
454    */
455   for (i = 0; i < bld->base.type.length; i++) {
456      LLVMValueRef ii = LLVMConstInt(LLVMInt32Type(), i, 0);
457      LLVMValueRef index = LLVMBuildExtractElement(bld->base.builder,
458                                                   indexes, ii, "");
459      LLVMValueRef scalar_ptr = LLVMBuildGEP(bld->base.builder, base_ptr,
460                                             &index, 1, "");
461      LLVMValueRef scalar = LLVMBuildLoad(bld->base.builder, scalar_ptr, "");
462
463      res = LLVMBuildInsertElement(bld->base.builder, res, scalar, ii, "");
464   }
465
466   return res;
467}
468
469
470/**
471 * Read the current value of the ADDR register, convert the floats to
472 * ints, multiply by four and return the vector of offsets.
473 * The offsets will be used to index into the constant buffer or
474 * temporary register file.
475 */
476static LLVMValueRef
477get_indirect_index(struct lp_build_tgsi_soa_context *bld,
478                   unsigned reg_file, unsigned reg_index,
479                   const struct tgsi_src_register *indirect_reg)
480{
481   struct lp_build_context *uint_bld = &bld->uint_bld;
482   /* always use X component of address register */
483   unsigned swizzle = indirect_reg->SwizzleX;
484   LLVMValueRef base;
485   LLVMValueRef rel;
486   LLVMValueRef max_index;
487   LLVMValueRef index;
488
489   assert(bld->indirect_files & (1 << reg_file));
490
491   base = lp_build_const_int_vec(uint_bld->type, reg_index);
492
493   assert(swizzle < 4);
494   rel = LLVMBuildLoad(bld->base.builder,
495                        bld->addr[indirect_reg->Index][swizzle],
496                        "load addr reg");
497
498   /* for indexing we want integers */
499   rel = LLVMBuildFPToSI(bld->base.builder,
500                         rel,
501                         uint_bld->vec_type, "");
502
503   index = lp_build_add(uint_bld, base, rel);
504
505   max_index = lp_build_const_int_vec(uint_bld->type,
506                                      bld->info->file_max[reg_file]);
507
508   assert(!uint_bld->type.sign);
509   index = lp_build_min(uint_bld, index, max_index);
510
511   return index;
512}
513
514
515/**
516 * Register fetch.
517 */
518static LLVMValueRef
519emit_fetch(
520   struct lp_build_tgsi_soa_context *bld,
521   const struct tgsi_full_instruction *inst,
522   unsigned src_op,
523   const unsigned chan_index )
524{
525   struct lp_build_context *uint_bld = &bld->uint_bld;
526   const struct tgsi_full_src_register *reg = &inst->Src[src_op];
527   const unsigned swizzle =
528      tgsi_util_get_full_src_register_swizzle(reg, chan_index);
529   LLVMValueRef res;
530   LLVMValueRef indirect_index = NULL;
531
532   if (swizzle > 3) {
533      assert(0 && "invalid swizzle in emit_fetch()");
534      return bld->base.undef;
535   }
536
537   if (reg->Register.Indirect) {
538      indirect_index = get_indirect_index(bld,
539                                          reg->Register.File,
540                                          reg->Register.Index,
541                                          &reg->Indirect);
542   } else {
543      assert(reg->Register.Index <= bld->info->file_max[reg->Register.File]);
544   }
545
546   switch (reg->Register.File) {
547   case TGSI_FILE_CONSTANT:
548      if (reg->Register.Indirect) {
549         LLVMValueRef swizzle_vec =
550            lp_build_const_int_vec(uint_bld->type, swizzle);
551         LLVMValueRef index_vec;  /* index into the const buffer */
552
553         /* index_vec = indirect_index * 4 + swizzle */
554         index_vec = lp_build_shl_imm(uint_bld, indirect_index, 2);
555         index_vec = lp_build_add(uint_bld, index_vec, swizzle_vec);
556
557         /* Gather values from the constant buffer */
558         res = build_gather(bld, bld->consts_ptr, index_vec);
559      }
560      else {
561         LLVMValueRef index;  /* index into the const buffer */
562         LLVMValueRef scalar, scalar_ptr;
563
564         index = lp_build_const_int32(reg->Register.Index*4 + swizzle);
565
566         scalar_ptr = LLVMBuildGEP(bld->base.builder, bld->consts_ptr,
567                                   &index, 1, "");
568         scalar = LLVMBuildLoad(bld->base.builder, scalar_ptr, "");
569
570         res = lp_build_broadcast_scalar(&bld->base, scalar);
571      }
572      break;
573
574   case TGSI_FILE_IMMEDIATE:
575      res = bld->immediates[reg->Register.Index][swizzle];
576      assert(res);
577      break;
578
579   case TGSI_FILE_INPUT:
580      res = bld->inputs[reg->Register.Index][swizzle];
581      assert(res);
582      break;
583
584   case TGSI_FILE_TEMPORARY:
585      if (reg->Register.Indirect) {
586         LLVMValueRef swizzle_vec =
587            lp_build_const_int_vec(uint_bld->type, swizzle);
588         LLVMValueRef length_vec =
589            lp_build_const_int_vec(uint_bld->type, bld->base.type.length);
590         LLVMValueRef index_vec;  /* index into the const buffer */
591         LLVMValueRef temps_array;
592         LLVMTypeRef float4_ptr_type;
593
594         /* index_vec = (indirect_index * 4 + swizzle) * length */
595         index_vec = lp_build_shl_imm(uint_bld, indirect_index, 2);
596         index_vec = lp_build_add(uint_bld, index_vec, swizzle_vec);
597         index_vec = lp_build_mul(uint_bld, index_vec, length_vec);
598
599         /* cast temps_array pointer to float* */
600         float4_ptr_type = LLVMPointerType(LLVMFloatType(), 0);
601         temps_array = LLVMBuildBitCast(uint_bld->builder, bld->temps_array,
602                                        float4_ptr_type, "");
603
604         /* Gather values from the temporary register array */
605         res = build_gather(bld, temps_array, index_vec);
606      }
607      else {
608         LLVMValueRef temp_ptr;
609         temp_ptr = get_temp_ptr(bld, reg->Register.Index, swizzle);
610         res = LLVMBuildLoad(bld->base.builder, temp_ptr, "");
611         if (!res)
612            return bld->base.undef;
613      }
614      break;
615
616   default:
617      assert(0 && "invalid src register in emit_fetch()");
618      return bld->base.undef;
619   }
620
621   switch( tgsi_util_get_full_src_register_sign_mode( reg, chan_index ) ) {
622   case TGSI_UTIL_SIGN_CLEAR:
623      res = lp_build_abs( &bld->base, res );
624      break;
625
626   case TGSI_UTIL_SIGN_SET:
627      res = lp_build_abs( &bld->base, res );
628      /* fall through */
629   case TGSI_UTIL_SIGN_TOGGLE:
630      res = lp_build_negate( &bld->base, res );
631      break;
632
633   case TGSI_UTIL_SIGN_KEEP:
634      break;
635   }
636
637   return res;
638}
639
640
641/**
642 * Register fetch with derivatives.
643 */
644static void
645emit_fetch_deriv(
646   struct lp_build_tgsi_soa_context *bld,
647   const struct tgsi_full_instruction *inst,
648   unsigned index,
649   const unsigned chan_index,
650   LLVMValueRef *res,
651   LLVMValueRef *ddx,
652   LLVMValueRef *ddy)
653{
654   LLVMValueRef src;
655
656   src = emit_fetch(bld, inst, index, chan_index);
657
658   if(res)
659      *res = src;
660
661   /* TODO: use interpolation coeffs for inputs */
662
663   if(ddx)
664      *ddx = lp_build_ddx(&bld->base, src);
665
666   if(ddy)
667      *ddy = lp_build_ddy(&bld->base, src);
668}
669
670
671/**
672 * Predicate.
673 */
674static void
675emit_fetch_predicate(
676   struct lp_build_tgsi_soa_context *bld,
677   const struct tgsi_full_instruction *inst,
678   LLVMValueRef *pred)
679{
680   unsigned index;
681   unsigned char swizzles[4];
682   LLVMValueRef unswizzled[4] = {NULL, NULL, NULL, NULL};
683   LLVMValueRef value;
684   unsigned chan;
685
686   if (!inst->Instruction.Predicate) {
687      FOR_EACH_CHANNEL( chan ) {
688         pred[chan] = NULL;
689      }
690      return;
691   }
692
693   swizzles[0] = inst->Predicate.SwizzleX;
694   swizzles[1] = inst->Predicate.SwizzleY;
695   swizzles[2] = inst->Predicate.SwizzleZ;
696   swizzles[3] = inst->Predicate.SwizzleW;
697
698   index = inst->Predicate.Index;
699   assert(index < LP_MAX_TGSI_PREDS);
700
701   FOR_EACH_CHANNEL( chan ) {
702      unsigned swizzle = swizzles[chan];
703
704      /*
705       * Only fetch the predicate register channels that are actually listed
706       * in the swizzles
707       */
708      if (!unswizzled[swizzle]) {
709         value = LLVMBuildLoad(bld->base.builder,
710                               bld->preds[index][swizzle], "");
711
712         /*
713          * Convert the value to an integer mask.
714          *
715          * TODO: Short-circuit this comparison -- a D3D setp_xx instructions
716          * is needlessly causing two comparisons due to storing the intermediate
717          * result as float vector instead of an integer mask vector.
718          */
719         value = lp_build_compare(bld->base.builder,
720                                  bld->base.type,
721                                  PIPE_FUNC_NOTEQUAL,
722                                  value,
723                                  bld->base.zero);
724         if (inst->Predicate.Negate) {
725            value = LLVMBuildNot(bld->base.builder, value, "");
726         }
727
728         unswizzled[swizzle] = value;
729      } else {
730         value = unswizzled[swizzle];
731      }
732
733      pred[chan] = value;
734   }
735}
736
737
738/**
739 * Register store.
740 */
741static void
742emit_store(
743   struct lp_build_tgsi_soa_context *bld,
744   const struct tgsi_full_instruction *inst,
745   unsigned index,
746   unsigned chan_index,
747   LLVMValueRef pred,
748   LLVMValueRef value)
749{
750   const struct tgsi_full_dst_register *reg = &inst->Dst[index];
751   LLVMValueRef indirect_index = NULL;
752
753   switch( inst->Instruction.Saturate ) {
754   case TGSI_SAT_NONE:
755      break;
756
757   case TGSI_SAT_ZERO_ONE:
758      value = lp_build_max(&bld->base, value, bld->base.zero);
759      value = lp_build_min(&bld->base, value, bld->base.one);
760      break;
761
762   case TGSI_SAT_MINUS_PLUS_ONE:
763      value = lp_build_max(&bld->base, value, lp_build_const_vec(bld->base.type, -1.0));
764      value = lp_build_min(&bld->base, value, bld->base.one);
765      break;
766
767   default:
768      assert(0);
769   }
770
771   if (reg->Register.Indirect) {
772      indirect_index = get_indirect_index(bld,
773                                          reg->Register.File,
774                                          reg->Register.Index,
775                                          &reg->Indirect);
776   } else {
777      assert(reg->Register.Index <= bld->info->file_max[reg->Register.File]);
778   }
779
780   switch( reg->Register.File ) {
781   case TGSI_FILE_OUTPUT:
782      lp_exec_mask_store(&bld->exec_mask, pred, value,
783                         bld->outputs[reg->Register.Index][chan_index]);
784      break;
785
786   case TGSI_FILE_TEMPORARY:
787      if (reg->Register.Indirect) {
788         /* XXX not done yet */
789         debug_printf("WARNING: LLVM scatter store of temp regs"
790                      " not implemented\n");
791      }
792      else {
793         LLVMValueRef temp_ptr = get_temp_ptr(bld, reg->Register.Index,
794                                              chan_index);
795         lp_exec_mask_store(&bld->exec_mask, pred, value, temp_ptr);
796      }
797      break;
798
799   case TGSI_FILE_ADDRESS:
800      lp_exec_mask_store(&bld->exec_mask, pred, value,
801                         bld->addr[reg->Indirect.Index][chan_index]);
802      break;
803
804   case TGSI_FILE_PREDICATE:
805      lp_exec_mask_store(&bld->exec_mask, pred, value,
806                         bld->preds[reg->Register.Index][chan_index]);
807      break;
808
809   default:
810      assert( 0 );
811   }
812}
813
814
815/**
816 * High-level instruction translators.
817 */
818
819static void
820emit_tex( struct lp_build_tgsi_soa_context *bld,
821          const struct tgsi_full_instruction *inst,
822          enum lp_build_tex_modifier modifier,
823          LLVMValueRef *texel)
824{
825   unsigned unit;
826   LLVMValueRef lod_bias, explicit_lod;
827   LLVMValueRef oow = NULL;
828   LLVMValueRef coords[3];
829   LLVMValueRef ddx[3];
830   LLVMValueRef ddy[3];
831   unsigned num_coords;
832   unsigned i;
833
834   if (!bld->sampler) {
835      _debug_printf("warning: found texture instruction but no sampler generator supplied\n");
836      for (i = 0; i < 4; i++) {
837         texel[i] = bld->base.undef;
838      }
839      return;
840   }
841
842   switch (inst->Texture.Texture) {
843   case TGSI_TEXTURE_1D:
844      num_coords = 1;
845      break;
846   case TGSI_TEXTURE_2D:
847   case TGSI_TEXTURE_RECT:
848      num_coords = 2;
849      break;
850   case TGSI_TEXTURE_SHADOW1D:
851   case TGSI_TEXTURE_SHADOW2D:
852   case TGSI_TEXTURE_SHADOWRECT:
853   case TGSI_TEXTURE_3D:
854   case TGSI_TEXTURE_CUBE:
855      num_coords = 3;
856      break;
857   default:
858      assert(0);
859      return;
860   }
861
862   if (modifier == LP_BLD_TEX_MODIFIER_LOD_BIAS) {
863      lod_bias = emit_fetch( bld, inst, 0, 3 );
864      explicit_lod = NULL;
865   }
866   else if (modifier == LP_BLD_TEX_MODIFIER_EXPLICIT_LOD) {
867      lod_bias = NULL;
868      explicit_lod = emit_fetch( bld, inst, 0, 3 );
869   }
870   else {
871      lod_bias = NULL;
872      explicit_lod = NULL;
873   }
874
875   if (modifier == LP_BLD_TEX_MODIFIER_PROJECTED) {
876      oow = emit_fetch( bld, inst, 0, 3 );
877      oow = lp_build_rcp(&bld->base, oow);
878   }
879
880   for (i = 0; i < num_coords; i++) {
881      coords[i] = emit_fetch( bld, inst, 0, i );
882      if (modifier == LP_BLD_TEX_MODIFIER_PROJECTED)
883         coords[i] = lp_build_mul(&bld->base, coords[i], oow);
884   }
885   for (i = num_coords; i < 3; i++) {
886      coords[i] = bld->base.undef;
887   }
888
889   if (modifier == LP_BLD_TEX_MODIFIER_EXPLICIT_DERIV) {
890      LLVMTypeRef i32t = LLVMInt32Type();
891      LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
892      for (i = 0; i < num_coords; i++) {
893         LLVMValueRef src1 = emit_fetch( bld, inst, 1, i );
894         LLVMValueRef src2 = emit_fetch( bld, inst, 2, i );
895         ddx[i] = LLVMBuildExtractElement(bld->base.builder, src1, index0, "");
896         ddy[i] = LLVMBuildExtractElement(bld->base.builder, src2, index0, "");
897      }
898      unit = inst->Src[3].Register.Index;
899   }  else {
900      for (i = 0; i < num_coords; i++) {
901         ddx[i] = lp_build_scalar_ddx( &bld->base, coords[i] );
902         ddy[i] = lp_build_scalar_ddy( &bld->base, coords[i] );
903      }
904      unit = inst->Src[1].Register.Index;
905   }
906   for (i = num_coords; i < 3; i++) {
907      ddx[i] = LLVMGetUndef(bld->base.elem_type);
908      ddy[i] = LLVMGetUndef(bld->base.elem_type);
909   }
910
911   bld->sampler->emit_fetch_texel(bld->sampler,
912                                  bld->base.builder,
913                                  bld->base.type,
914                                  unit, num_coords, coords,
915                                  ddx, ddy,
916                                  lod_bias, explicit_lod,
917                                  texel);
918}
919
920
921/**
922 * Kill fragment if any of the src register values are negative.
923 */
924static void
925emit_kil(
926   struct lp_build_tgsi_soa_context *bld,
927   const struct tgsi_full_instruction *inst )
928{
929   const struct tgsi_full_src_register *reg = &inst->Src[0];
930   LLVMValueRef terms[NUM_CHANNELS];
931   LLVMValueRef mask;
932   unsigned chan_index;
933
934   memset(&terms, 0, sizeof terms);
935
936   FOR_EACH_CHANNEL( chan_index ) {
937      unsigned swizzle;
938
939      /* Unswizzle channel */
940      swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index );
941
942      /* Check if the component has not been already tested. */
943      assert(swizzle < NUM_CHANNELS);
944      if( !terms[swizzle] )
945         /* TODO: change the comparison operator instead of setting the sign */
946         terms[swizzle] =  emit_fetch(bld, inst, 0, chan_index );
947   }
948
949   mask = NULL;
950   FOR_EACH_CHANNEL( chan_index ) {
951      if(terms[chan_index]) {
952         LLVMValueRef chan_mask;
953
954         /*
955          * If term < 0 then mask = 0 else mask = ~0.
956          */
957         chan_mask = lp_build_cmp(&bld->base, PIPE_FUNC_GEQUAL, terms[chan_index], bld->base.zero);
958
959         if(mask)
960            mask = LLVMBuildAnd(bld->base.builder, mask, chan_mask, "");
961         else
962            mask = chan_mask;
963      }
964   }
965
966   if(mask) {
967      lp_build_mask_update(bld->mask, mask);
968
969      /* XXX: figure out if we are at the end of the shader and skip this:
970       */
971      lp_build_mask_check(bld->mask);
972   }
973}
974
975
976/**
977 * Predicated fragment kill.
978 * XXX Actually, we do an unconditional kill (as in tgsi_exec.c).
979 * The only predication is the execution mask which will apply if
980 * we're inside a loop or conditional.
981 */
982static void
983emit_kilp(struct lp_build_tgsi_soa_context *bld,
984          const struct tgsi_full_instruction *inst)
985{
986   LLVMValueRef mask;
987
988   /* For those channels which are "alive", disable fragment shader
989    * execution.
990    */
991   if (bld->exec_mask.has_mask) {
992      mask = LLVMBuildNot(bld->base.builder, bld->exec_mask.exec_mask, "kilp");
993   }
994   else {
995      mask = bld->base.zero;
996   }
997
998   lp_build_mask_update(bld->mask, mask);
999
1000   /* XXX: figure out if we are at the end of the shader and skip this:
1001    */
1002   lp_build_mask_check(bld->mask);
1003}
1004
1005static void
1006emit_declaration(
1007   struct lp_build_tgsi_soa_context *bld,
1008   const struct tgsi_full_declaration *decl)
1009{
1010   LLVMTypeRef vec_type = bld->base.vec_type;
1011
1012   unsigned first = decl->Range.First;
1013   unsigned last = decl->Range.Last;
1014   unsigned idx, i;
1015
1016   for (idx = first; idx <= last; ++idx) {
1017      assert(last <= bld->info->file_max[decl->Declaration.File]);
1018      switch (decl->Declaration.File) {
1019      case TGSI_FILE_TEMPORARY:
1020         assert(idx < LP_MAX_TGSI_TEMPS);
1021         if (bld->indirect_files & (1 << TGSI_FILE_TEMPORARY)) {
1022            LLVMValueRef array_size = LLVMConstInt(LLVMInt32Type(),
1023                                                   last*4 + 4, 0);
1024            bld->temps_array = lp_build_array_alloca(bld->base.builder,
1025                                                     vec_type, array_size, "");
1026         } else {
1027            for (i = 0; i < NUM_CHANNELS; i++)
1028               bld->temps[idx][i] = lp_build_alloca(bld->base.builder,
1029                                                    vec_type, "");
1030         }
1031         break;
1032
1033      case TGSI_FILE_OUTPUT:
1034         for (i = 0; i < NUM_CHANNELS; i++)
1035            bld->outputs[idx][i] = lp_build_alloca(bld->base.builder,
1036                                                   vec_type, "");
1037         break;
1038
1039      case TGSI_FILE_ADDRESS:
1040         assert(idx < LP_MAX_TGSI_ADDRS);
1041         for (i = 0; i < NUM_CHANNELS; i++)
1042            bld->addr[idx][i] = lp_build_alloca(bld->base.builder,
1043                                                vec_type, "");
1044         break;
1045
1046      case TGSI_FILE_PREDICATE:
1047         assert(idx < LP_MAX_TGSI_PREDS);
1048         for (i = 0; i < NUM_CHANNELS; i++)
1049            bld->preds[idx][i] = lp_build_alloca(bld->base.builder,
1050                                                 vec_type, "");
1051         break;
1052
1053      default:
1054         /* don't need to declare other vars */
1055         break;
1056      }
1057   }
1058}
1059
1060
/**
 * Emit LLVM for one TGSI instruction.
 * \return TRUE for success, FALSE otherwise
 */
1065static boolean
1066emit_instruction(
1067   struct lp_build_tgsi_soa_context *bld,
1068   const struct tgsi_full_instruction *inst,
1069   const struct tgsi_opcode_info *info,
1070   int *pc)
1071{
1072   unsigned chan_index;
1073   LLVMValueRef src0, src1, src2;
1074   LLVMValueRef tmp0, tmp1, tmp2;
1075   LLVMValueRef tmp3 = NULL;
1076   LLVMValueRef tmp4 = NULL;
1077   LLVMValueRef tmp5 = NULL;
1078   LLVMValueRef tmp6 = NULL;
1079   LLVMValueRef tmp7 = NULL;
1080   LLVMValueRef res;
1081   LLVMValueRef dst0[NUM_CHANNELS];
1082
1083   /*
1084    * Stores and write masks are handled in a general fashion after the long
1085    * instruction opcode switch statement.
1086    *
    * Although not strictly necessary, we avoid generating instructions for
    * channels which won't be stored, in cases where that's easy. For some
1089    * complex instructions, like texture sampling, it is more convenient to
1090    * assume a full writemask and then let LLVM optimization passes eliminate
1091    * redundant code.
1092    */
1093
1094   (*pc)++;
1095
1096   assert(info->num_dst <= 1);
1097   if (info->num_dst) {
1098      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1099         dst0[chan_index] = bld->base.undef;
1100      }
1101   }
1102
1103   switch (inst->Instruction.Opcode) {
1104   case TGSI_OPCODE_ARL:
1105      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1106         tmp0 = emit_fetch( bld, inst, 0, chan_index );
1107         tmp0 = lp_build_floor(&bld->base, tmp0);
1108         dst0[chan_index] = tmp0;
1109      }
1110      break;
1111
1112   case TGSI_OPCODE_MOV:
1113      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1114         dst0[chan_index] = emit_fetch( bld, inst, 0, chan_index );
1115      }
1116      break;
1117
1118   case TGSI_OPCODE_LIT:
1119      if( IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ) ) {
1120         dst0[CHAN_X] = bld->base.one;
1121      }
1122      if( IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y ) ) {
1123         src0 = emit_fetch( bld, inst, 0, CHAN_X );
1124         dst0[CHAN_Y] = lp_build_max( &bld->base, src0, bld->base.zero);
1125      }
1126      if( IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z ) ) {
1127         /* XMM[1] = SrcReg[0].yyyy */
1128         tmp1 = emit_fetch( bld, inst, 0, CHAN_Y );
1129         /* XMM[1] = max(XMM[1], 0) */
1130         tmp1 = lp_build_max( &bld->base, tmp1, bld->base.zero);
1131         /* XMM[2] = SrcReg[0].wwww */
1132         tmp2 = emit_fetch( bld, inst, 0, CHAN_W );
1133         tmp1 = lp_build_pow( &bld->base, tmp1, tmp2);
1134         tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
1135         tmp2 = lp_build_cmp(&bld->base, PIPE_FUNC_GREATER, tmp0, bld->base.zero);
1136         dst0[CHAN_Z] = lp_build_select(&bld->base, tmp2, tmp1, bld->base.zero);
1137      }
1138      if( IS_DST0_CHANNEL_ENABLED( inst, CHAN_W ) ) {
1139         dst0[CHAN_W] = bld->base.one;
1140      }
1141      break;
1142
1143   case TGSI_OPCODE_RCP:
1144   /* TGSI_OPCODE_RECIP */
1145      src0 = emit_fetch( bld, inst, 0, CHAN_X );
1146      res = lp_build_rcp(&bld->base, src0);
1147      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1148         dst0[chan_index] = res;
1149      }
1150      break;
1151
1152   case TGSI_OPCODE_RSQ:
1153   /* TGSI_OPCODE_RECIPSQRT */
1154      src0 = emit_fetch( bld, inst, 0, CHAN_X );
1155      src0 = lp_build_abs(&bld->base, src0);
1156      res = lp_build_rsqrt(&bld->base, src0);
1157      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1158         dst0[chan_index] = res;
1159      }
1160      break;
1161
1162   case TGSI_OPCODE_EXP:
1163      if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ) ||
1164          IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y ) ||
1165          IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z )) {
1166         LLVMValueRef *p_exp2_int_part = NULL;
1167         LLVMValueRef *p_frac_part = NULL;
1168         LLVMValueRef *p_exp2 = NULL;
1169
1170         src0 = emit_fetch( bld, inst, 0, CHAN_X );
1171
1172         if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ))
1173            p_exp2_int_part = &tmp0;
1174         if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y ))
1175            p_frac_part = &tmp1;
1176         if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z ))
1177            p_exp2 = &tmp2;
1178
1179         lp_build_exp2_approx(&bld->base, src0, p_exp2_int_part, p_frac_part, p_exp2);
1180
1181         if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ))
1182            dst0[CHAN_X] = tmp0;
1183         if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y ))
1184            dst0[CHAN_Y] = tmp1;
1185         if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z ))
1186            dst0[CHAN_Z] = tmp2;
1187      }
1188      /* dst.w = 1.0 */
1189      if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_W )) {
1190         dst0[CHAN_W] = bld->base.one;
1191      }
1192      break;
1193
1194   case TGSI_OPCODE_LOG:
1195      if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ) ||
1196          IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y ) ||
1197          IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z )) {
1198         LLVMValueRef *p_floor_log2 = NULL;
1199         LLVMValueRef *p_exp = NULL;
1200         LLVMValueRef *p_log2 = NULL;
1201
1202         src0 = emit_fetch( bld, inst, 0, CHAN_X );
1203         src0 = lp_build_abs( &bld->base, src0 );
1204
1205         if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ))
1206            p_floor_log2 = &tmp0;
1207         if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y ))
1208            p_exp = &tmp1;
1209         if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z ))
1210            p_log2 = &tmp2;
1211
1212         lp_build_log2_approx(&bld->base, src0, p_exp, p_floor_log2, p_log2);
1213
1214         /* dst.x = floor(lg2(abs(src.x))) */
1215         if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ))
1216            dst0[CHAN_X] = tmp0;
1217         /* dst.y = abs(src)/ex2(floor(lg2(abs(src.x)))) */
1218         if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y )) {
1219            dst0[CHAN_Y] = lp_build_div( &bld->base, src0, tmp1);
1220         }
1221         /* dst.z = lg2(abs(src.x)) */
1222         if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z ))
1223            dst0[CHAN_Z] = tmp2;
1224      }
1225      /* dst.w = 1.0 */
1226      if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_W )) {
1227         dst0[CHAN_W] = bld->base.one;
1228      }
1229      break;
1230
1231   case TGSI_OPCODE_MUL:
1232      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1233         src0 = emit_fetch( bld, inst, 0, chan_index );
1234         src1 = emit_fetch( bld, inst, 1, chan_index );
1235         dst0[chan_index] = lp_build_mul(&bld->base, src0, src1);
1236      }
1237      break;
1238
1239   case TGSI_OPCODE_ADD:
1240      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1241         src0 = emit_fetch( bld, inst, 0, chan_index );
1242         src1 = emit_fetch( bld, inst, 1, chan_index );
1243         dst0[chan_index] = lp_build_add(&bld->base, src0, src1);
1244      }
1245      break;
1246
1247   case TGSI_OPCODE_DP3:
1248   /* TGSI_OPCODE_DOT3 */
1249      tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
1250      tmp1 = emit_fetch( bld, inst, 1, CHAN_X );
1251      tmp0 = lp_build_mul( &bld->base, tmp0, tmp1);
1252      tmp1 = emit_fetch( bld, inst, 0, CHAN_Y );
1253      tmp2 = emit_fetch( bld, inst, 1, CHAN_Y );
1254      tmp1 = lp_build_mul( &bld->base, tmp1, tmp2);
1255      tmp0 = lp_build_add( &bld->base, tmp0, tmp1);
1256      tmp1 = emit_fetch( bld, inst, 0, CHAN_Z );
1257      tmp2 = emit_fetch( bld, inst, 1, CHAN_Z );
1258      tmp1 = lp_build_mul( &bld->base, tmp1, tmp2);
1259      tmp0 = lp_build_add( &bld->base, tmp0, tmp1);
1260      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1261         dst0[chan_index] = tmp0;
1262      }
1263      break;
1264
1265   case TGSI_OPCODE_DP4:
1266   /* TGSI_OPCODE_DOT4 */
1267      tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
1268      tmp1 = emit_fetch( bld, inst, 1, CHAN_X );
1269      tmp0 = lp_build_mul( &bld->base, tmp0, tmp1);
1270      tmp1 = emit_fetch( bld, inst, 0, CHAN_Y );
1271      tmp2 = emit_fetch( bld, inst, 1, CHAN_Y );
1272      tmp1 = lp_build_mul( &bld->base, tmp1, tmp2);
1273      tmp0 = lp_build_add( &bld->base, tmp0, tmp1);
1274      tmp1 = emit_fetch( bld, inst, 0, CHAN_Z );
1275      tmp2 = emit_fetch( bld, inst, 1, CHAN_Z );
1276      tmp1 = lp_build_mul( &bld->base, tmp1, tmp2);
1277      tmp0 = lp_build_add( &bld->base, tmp0, tmp1);
1278      tmp1 = emit_fetch( bld, inst, 0, CHAN_W );
1279      tmp2 = emit_fetch( bld, inst, 1, CHAN_W );
1280      tmp1 = lp_build_mul( &bld->base, tmp1, tmp2);
1281      tmp0 = lp_build_add( &bld->base, tmp0, tmp1);
1282      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1283         dst0[chan_index] = tmp0;
1284      }
1285      break;
1286
1287   case TGSI_OPCODE_DST:
1288      IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ) {
1289         dst0[CHAN_X] = bld->base.one;
1290      }
1291      IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y ) {
1292         tmp0 = emit_fetch( bld, inst, 0, CHAN_Y );
1293         tmp1 = emit_fetch( bld, inst, 1, CHAN_Y );
1294         dst0[CHAN_Y] = lp_build_mul( &bld->base, tmp0, tmp1);
1295      }
1296      IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z ) {
1297         dst0[CHAN_Z] = emit_fetch( bld, inst, 0, CHAN_Z );
1298      }
1299      IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_W ) {
1300         dst0[CHAN_W] = emit_fetch( bld, inst, 1, CHAN_W );
1301      }
1302      break;
1303
1304   case TGSI_OPCODE_MIN:
1305      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1306         src0 = emit_fetch( bld, inst, 0, chan_index );
1307         src1 = emit_fetch( bld, inst, 1, chan_index );
1308         dst0[chan_index] = lp_build_min( &bld->base, src0, src1 );
1309      }
1310      break;
1311
1312   case TGSI_OPCODE_MAX:
1313      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1314         src0 = emit_fetch( bld, inst, 0, chan_index );
1315         src1 = emit_fetch( bld, inst, 1, chan_index );
1316         dst0[chan_index] = lp_build_max( &bld->base, src0, src1 );
1317      }
1318      break;
1319
1320   case TGSI_OPCODE_SLT:
1321   /* TGSI_OPCODE_SETLT */
1322      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1323         src0 = emit_fetch( bld, inst, 0, chan_index );
1324         src1 = emit_fetch( bld, inst, 1, chan_index );
1325         tmp0 = lp_build_cmp( &bld->base, PIPE_FUNC_LESS, src0, src1 );
1326         dst0[chan_index] = lp_build_select( &bld->base, tmp0, bld->base.one, bld->base.zero );
1327      }
1328      break;
1329
1330   case TGSI_OPCODE_SGE:
1331   /* TGSI_OPCODE_SETGE */
1332      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1333         src0 = emit_fetch( bld, inst, 0, chan_index );
1334         src1 = emit_fetch( bld, inst, 1, chan_index );
1335         tmp0 = lp_build_cmp( &bld->base, PIPE_FUNC_GEQUAL, src0, src1 );
1336         dst0[chan_index] = lp_build_select( &bld->base, tmp0, bld->base.one, bld->base.zero );
1337      }
1338      break;
1339
1340   case TGSI_OPCODE_MAD:
1341   /* TGSI_OPCODE_MADD */
1342      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1343         tmp0 = emit_fetch( bld, inst, 0, chan_index );
1344         tmp1 = emit_fetch( bld, inst, 1, chan_index );
1345         tmp2 = emit_fetch( bld, inst, 2, chan_index );
1346         tmp0 = lp_build_mul( &bld->base, tmp0, tmp1);
1347         tmp0 = lp_build_add( &bld->base, tmp0, tmp2);
1348         dst0[chan_index] = tmp0;
1349      }
1350      break;
1351
1352   case TGSI_OPCODE_SUB:
1353      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1354         tmp0 = emit_fetch( bld, inst, 0, chan_index );
1355         tmp1 = emit_fetch( bld, inst, 1, chan_index );
1356         dst0[chan_index] = lp_build_sub( &bld->base, tmp0, tmp1);
1357      }
1358      break;
1359
1360   case TGSI_OPCODE_LRP:
1361      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1362         src0 = emit_fetch( bld, inst, 0, chan_index );
1363         src1 = emit_fetch( bld, inst, 1, chan_index );
1364         src2 = emit_fetch( bld, inst, 2, chan_index );
1365         tmp0 = lp_build_sub( &bld->base, src1, src2 );
1366         tmp0 = lp_build_mul( &bld->base, src0, tmp0 );
1367         dst0[chan_index] = lp_build_add( &bld->base, tmp0, src2 );
1368      }
1369      break;
1370
1371   case TGSI_OPCODE_CND:
1372      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1373         src0 = emit_fetch( bld, inst, 0, chan_index );
1374         src1 = emit_fetch( bld, inst, 1, chan_index );
1375         src2 = emit_fetch( bld, inst, 2, chan_index );
1376         tmp1 = lp_build_const_vec(bld->base.type, 0.5);
1377         tmp0 = lp_build_cmp( &bld->base, PIPE_FUNC_GREATER, src2, tmp1);
1378         dst0[chan_index] = lp_build_select( &bld->base, tmp0, src0, src1 );
1379      }
1380      break;
1381
1382   case TGSI_OPCODE_DP2A:
1383      tmp0 = emit_fetch( bld, inst, 0, CHAN_X );  /* xmm0 = src[0].x */
1384      tmp1 = emit_fetch( bld, inst, 1, CHAN_X );  /* xmm1 = src[1].x */
1385      tmp0 = lp_build_mul( &bld->base, tmp0, tmp1);              /* xmm0 = xmm0 * xmm1 */
1386      tmp1 = emit_fetch( bld, inst, 0, CHAN_Y );  /* xmm1 = src[0].y */
1387      tmp2 = emit_fetch( bld, inst, 1, CHAN_Y );  /* xmm2 = src[1].y */
1388      tmp1 = lp_build_mul( &bld->base, tmp1, tmp2);              /* xmm1 = xmm1 * xmm2 */
1389      tmp0 = lp_build_add( &bld->base, tmp0, tmp1);              /* xmm0 = xmm0 + xmm1 */
1390      tmp1 = emit_fetch( bld, inst, 2, CHAN_X );  /* xmm1 = src[2].x */
1391      tmp0 = lp_build_add( &bld->base, tmp0, tmp1);              /* xmm0 = xmm0 + xmm1 */
1392      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1393         dst0[chan_index] = tmp0;  /* dest[ch] = xmm0 */
1394      }
1395      break;
1396
1397   case TGSI_OPCODE_FRC:
1398      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1399         src0 = emit_fetch( bld, inst, 0, chan_index );
1400         tmp0 = lp_build_floor(&bld->base, src0);
1401         tmp0 = lp_build_sub(&bld->base, src0, tmp0);
1402         dst0[chan_index] = tmp0;
1403      }
1404      break;
1405
1406   case TGSI_OPCODE_CLAMP:
1407      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1408         tmp0 = emit_fetch( bld, inst, 0, chan_index );
1409         src1 = emit_fetch( bld, inst, 1, chan_index );
1410         src2 = emit_fetch( bld, inst, 2, chan_index );
1411         tmp0 = lp_build_max(&bld->base, tmp0, src1);
1412         tmp0 = lp_build_min(&bld->base, tmp0, src2);
1413         dst0[chan_index] = tmp0;
1414      }
1415      break;
1416
1417   case TGSI_OPCODE_FLR:
1418      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1419         tmp0 = emit_fetch( bld, inst, 0, chan_index );
1420         dst0[chan_index] = lp_build_floor(&bld->base, tmp0);
1421      }
1422      break;
1423
1424   case TGSI_OPCODE_ROUND:
1425      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1426         tmp0 = emit_fetch( bld, inst, 0, chan_index );
1427         dst0[chan_index] = lp_build_round(&bld->base, tmp0);
1428      }
1429      break;
1430
1431   case TGSI_OPCODE_EX2: {
1432      tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
1433      tmp0 = lp_build_exp2( &bld->base, tmp0);
1434      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1435         dst0[chan_index] = tmp0;
1436      }
1437      break;
1438   }
1439
1440   case TGSI_OPCODE_LG2:
1441      tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
1442      tmp0 = lp_build_log2( &bld->base, tmp0);
1443      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1444         dst0[chan_index] = tmp0;
1445      }
1446      break;
1447
1448   case TGSI_OPCODE_POW:
1449      src0 = emit_fetch( bld, inst, 0, CHAN_X );
1450      src1 = emit_fetch( bld, inst, 1, CHAN_X );
1451      res = lp_build_pow( &bld->base, src0, src1 );
1452      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1453         dst0[chan_index] = res;
1454      }
1455      break;
1456
1457   case TGSI_OPCODE_XPD:
1458      if( IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ) ||
1459          IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y ) ) {
1460         tmp1 = emit_fetch( bld, inst, 1, CHAN_Z );
1461         tmp3 = emit_fetch( bld, inst, 0, CHAN_Z );
1462      }
1463      if( IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ) ||
1464          IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z ) ) {
1465         tmp0 = emit_fetch( bld, inst, 0, CHAN_Y );
1466         tmp4 = emit_fetch( bld, inst, 1, CHAN_Y );
1467      }
1468      IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ) {
1469         tmp2 = tmp0;
1470         tmp2 = lp_build_mul( &bld->base, tmp2, tmp1);
1471         tmp5 = tmp3;
1472         tmp5 = lp_build_mul( &bld->base, tmp5, tmp4);
1473         tmp2 = lp_build_sub( &bld->base, tmp2, tmp5);
1474         dst0[CHAN_X] = tmp2;
1475      }
1476      if( IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y ) ||
1477          IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z ) ) {
1478         tmp2 = emit_fetch( bld, inst, 1, CHAN_X );
1479         tmp5 = emit_fetch( bld, inst, 0, CHAN_X );
1480      }
1481      IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y ) {
1482         tmp3 = lp_build_mul( &bld->base, tmp3, tmp2);
1483         tmp1 = lp_build_mul( &bld->base, tmp1, tmp5);
1484         tmp3 = lp_build_sub( &bld->base, tmp3, tmp1);
1485         dst0[CHAN_Y] = tmp3;
1486      }
1487      IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z ) {
1488         tmp5 = lp_build_mul( &bld->base, tmp5, tmp4);
1489         tmp0 = lp_build_mul( &bld->base, tmp0, tmp2);
1490         tmp5 = lp_build_sub( &bld->base, tmp5, tmp0);
1491         dst0[CHAN_Z] = tmp5;
1492      }
1493      IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_W ) {
1494         dst0[CHAN_W] = bld->base.one;
1495      }
1496      break;
1497
1498   case TGSI_OPCODE_ABS:
1499      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1500         tmp0 = emit_fetch( bld, inst, 0, chan_index );
1501         dst0[chan_index] = lp_build_abs( &bld->base, tmp0 );
1502      }
1503      break;
1504
1505   case TGSI_OPCODE_RCC:
1506      /* deprecated? */
1507      assert(0);
1508      return FALSE;
1509
1510   case TGSI_OPCODE_DPH:
1511      tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
1512      tmp1 = emit_fetch( bld, inst, 1, CHAN_X );
1513      tmp0 = lp_build_mul( &bld->base, tmp0, tmp1);
1514      tmp1 = emit_fetch( bld, inst, 0, CHAN_Y );
1515      tmp2 = emit_fetch( bld, inst, 1, CHAN_Y );
1516      tmp1 = lp_build_mul( &bld->base, tmp1, tmp2);
1517      tmp0 = lp_build_add( &bld->base, tmp0, tmp1);
1518      tmp1 = emit_fetch( bld, inst, 0, CHAN_Z );
1519      tmp2 = emit_fetch( bld, inst, 1, CHAN_Z );
1520      tmp1 = lp_build_mul( &bld->base, tmp1, tmp2);
1521      tmp0 = lp_build_add( &bld->base, tmp0, tmp1);
1522      tmp1 = emit_fetch( bld, inst, 1, CHAN_W );
1523      tmp0 = lp_build_add( &bld->base, tmp0, tmp1);
1524      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1525         dst0[chan_index] = tmp0;
1526      }
1527      break;
1528
1529   case TGSI_OPCODE_COS:
1530      tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
1531      tmp0 = lp_build_cos( &bld->base, tmp0 );
1532      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1533         dst0[chan_index] = tmp0;
1534      }
1535      break;
1536
1537   case TGSI_OPCODE_DDX:
1538      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1539         emit_fetch_deriv( bld, inst, 0, chan_index, NULL, &dst0[chan_index], NULL);
1540      }
1541      break;
1542
1543   case TGSI_OPCODE_DDY:
1544      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1545         emit_fetch_deriv( bld, inst, 0, chan_index, NULL, NULL, &dst0[chan_index]);
1546      }
1547      break;
1548
1549   case TGSI_OPCODE_KILP:
1550      /* predicated kill */
1551      emit_kilp( bld, inst );
1552      break;
1553
1554   case TGSI_OPCODE_KIL:
1555      /* conditional kill */
1556      emit_kil( bld, inst );
1557      break;
1558
1559   case TGSI_OPCODE_PK2H:
1560      return FALSE;
1561      break;
1562
1563   case TGSI_OPCODE_PK2US:
1564      return FALSE;
1565      break;
1566
1567   case TGSI_OPCODE_PK4B:
1568      return FALSE;
1569      break;
1570
1571   case TGSI_OPCODE_PK4UB:
1572      return FALSE;
1573      break;
1574
1575   case TGSI_OPCODE_RFL:
1576      return FALSE;
1577      break;
1578
1579   case TGSI_OPCODE_SEQ:
1580      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1581         src0 = emit_fetch( bld, inst, 0, chan_index );
1582         src1 = emit_fetch( bld, inst, 1, chan_index );
1583         tmp0 = lp_build_cmp( &bld->base, PIPE_FUNC_EQUAL, src0, src1 );
1584         dst0[chan_index] = lp_build_select( &bld->base, tmp0, bld->base.one, bld->base.zero );
1585      }
1586      break;
1587
1588   case TGSI_OPCODE_SFL:
1589      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1590         dst0[chan_index] = bld->base.zero;
1591      }
1592      break;
1593
1594   case TGSI_OPCODE_SGT:
1595      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1596         src0 = emit_fetch( bld, inst, 0, chan_index );
1597         src1 = emit_fetch( bld, inst, 1, chan_index );
1598         tmp0 = lp_build_cmp( &bld->base, PIPE_FUNC_GREATER, src0, src1 );
1599         dst0[chan_index] = lp_build_select( &bld->base, tmp0, bld->base.one, bld->base.zero );
1600      }
1601      break;
1602
1603   case TGSI_OPCODE_SIN:
1604      tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
1605      tmp0 = lp_build_sin( &bld->base, tmp0 );
1606      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1607         dst0[chan_index] = tmp0;
1608      }
1609      break;
1610
1611   case TGSI_OPCODE_SLE:
1612      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1613         src0 = emit_fetch( bld, inst, 0, chan_index );
1614         src1 = emit_fetch( bld, inst, 1, chan_index );
1615         tmp0 = lp_build_cmp( &bld->base, PIPE_FUNC_LEQUAL, src0, src1 );
1616         dst0[chan_index] = lp_build_select( &bld->base, tmp0, bld->base.one, bld->base.zero );
1617      }
1618      break;
1619
1620   case TGSI_OPCODE_SNE:
1621      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1622         src0 = emit_fetch( bld, inst, 0, chan_index );
1623         src1 = emit_fetch( bld, inst, 1, chan_index );
1624         tmp0 = lp_build_cmp( &bld->base, PIPE_FUNC_NOTEQUAL, src0, src1 );
1625         dst0[chan_index] = lp_build_select( &bld->base, tmp0, bld->base.one, bld->base.zero );
1626      }
1627      break;
1628
1629   case TGSI_OPCODE_STR:
1630      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1631         dst0[chan_index] = bld->base.one;
1632      }
1633      break;
1634
1635   case TGSI_OPCODE_TEX:
1636      emit_tex( bld, inst, LP_BLD_TEX_MODIFIER_NONE, dst0 );
1637      break;
1638
1639   case TGSI_OPCODE_TXD:
1640      emit_tex( bld, inst, LP_BLD_TEX_MODIFIER_EXPLICIT_DERIV, dst0 );
1641      break;
1642
1643   case TGSI_OPCODE_UP2H:
1644      /* deprecated */
1645      assert (0);
1646      return FALSE;
1647      break;
1648
1649   case TGSI_OPCODE_UP2US:
1650      /* deprecated */
1651      assert(0);
1652      return FALSE;
1653      break;
1654
1655   case TGSI_OPCODE_UP4B:
1656      /* deprecated */
1657      assert(0);
1658      return FALSE;
1659      break;
1660
1661   case TGSI_OPCODE_UP4UB:
1662      /* deprecated */
1663      assert(0);
1664      return FALSE;
1665      break;
1666
1667   case TGSI_OPCODE_X2D:
1668      /* deprecated? */
1669      assert(0);
1670      return FALSE;
1671      break;
1672
1673   case TGSI_OPCODE_ARA:
1674      /* deprecated */
1675      assert(0);
1676      return FALSE;
1677      break;
1678
1679   case TGSI_OPCODE_ARR:
1680      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1681         tmp0 = emit_fetch( bld, inst, 0, chan_index );
1682         tmp0 = lp_build_round(&bld->base, tmp0);
1683         dst0[chan_index] = tmp0;
1684      }
1685      break;
1686
1687   case TGSI_OPCODE_BRA:
1688      /* deprecated */
1689      assert(0);
1690      return FALSE;
1691      break;
1692
1693   case TGSI_OPCODE_CAL:
1694      lp_exec_mask_call(&bld->exec_mask,
1695                        inst->Label.Label,
1696                        pc);
1697
1698      break;
1699
1700   case TGSI_OPCODE_RET:
1701      lp_exec_mask_ret(&bld->exec_mask, pc);
1702      break;
1703
1704   case TGSI_OPCODE_END:
1705      *pc = -1;
1706      break;
1707
1708   case TGSI_OPCODE_SSG:
1709   /* TGSI_OPCODE_SGN */
1710      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1711         tmp0 = emit_fetch( bld, inst, 0, chan_index );
1712         dst0[chan_index] = lp_build_sgn( &bld->base, tmp0 );
1713      }
1714      break;
1715
1716   case TGSI_OPCODE_CMP:
1717      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1718         src0 = emit_fetch( bld, inst, 0, chan_index );
1719         src1 = emit_fetch( bld, inst, 1, chan_index );
1720         src2 = emit_fetch( bld, inst, 2, chan_index );
1721         tmp0 = lp_build_cmp( &bld->base, PIPE_FUNC_LESS, src0, bld->base.zero );
1722         dst0[chan_index] = lp_build_select( &bld->base, tmp0, src1, src2);
1723      }
1724      break;
1725
1726   case TGSI_OPCODE_SCS:
1727      IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ) {
1728         tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
1729         dst0[CHAN_X] = lp_build_cos( &bld->base, tmp0 );
1730      }
1731      IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y ) {
1732         tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
1733         dst0[CHAN_Y] = lp_build_sin( &bld->base, tmp0 );
1734      }
1735      IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z ) {
1736         dst0[CHAN_Z] = bld->base.zero;
1737      }
1738      IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_W ) {
1739         dst0[CHAN_W] = bld->base.one;
1740      }
1741      break;
1742
1743   case TGSI_OPCODE_TXB:
1744      emit_tex( bld, inst, LP_BLD_TEX_MODIFIER_LOD_BIAS, dst0 );
1745      break;
1746
1747   case TGSI_OPCODE_NRM:
1748      /* fall-through */
1749   case TGSI_OPCODE_NRM4:
1750      /* 3 or 4-component normalization */
1751      {
1752         uint dims = (inst->Instruction.Opcode == TGSI_OPCODE_NRM) ? 3 : 4;
1753
1754         if (IS_DST0_CHANNEL_ENABLED(inst, CHAN_X) ||
1755             IS_DST0_CHANNEL_ENABLED(inst, CHAN_Y) ||
1756             IS_DST0_CHANNEL_ENABLED(inst, CHAN_Z) ||
1757             (IS_DST0_CHANNEL_ENABLED(inst, CHAN_W) && dims == 4)) {
1758
1759            /* NOTE: Cannot use xmm regs 2/3 here (see emit_rsqrt() above). */
1760
1761            /* xmm4 = src.x */
1762            /* xmm0 = src.x * src.x */
1763            tmp0 = emit_fetch(bld, inst, 0, CHAN_X);
1764            if (IS_DST0_CHANNEL_ENABLED(inst, CHAN_X)) {
1765               tmp4 = tmp0;
1766            }
1767            tmp0 = lp_build_mul( &bld->base, tmp0, tmp0);
1768
1769            /* xmm5 = src.y */
1770            /* xmm0 = xmm0 + src.y * src.y */
1771            tmp1 = emit_fetch(bld, inst, 0, CHAN_Y);
1772            if (IS_DST0_CHANNEL_ENABLED(inst, CHAN_Y)) {
1773               tmp5 = tmp1;
1774            }
1775            tmp1 = lp_build_mul( &bld->base, tmp1, tmp1);
1776            tmp0 = lp_build_add( &bld->base, tmp0, tmp1);
1777
1778            /* xmm6 = src.z */
1779            /* xmm0 = xmm0 + src.z * src.z */
1780            tmp1 = emit_fetch(bld, inst, 0, CHAN_Z);
1781            if (IS_DST0_CHANNEL_ENABLED(inst, CHAN_Z)) {
1782               tmp6 = tmp1;
1783            }
1784            tmp1 = lp_build_mul( &bld->base, tmp1, tmp1);
1785            tmp0 = lp_build_add( &bld->base, tmp0, tmp1);
1786
1787            if (dims == 4) {
1788               /* xmm7 = src.w */
1789               /* xmm0 = xmm0 + src.w * src.w */
1790               tmp1 = emit_fetch(bld, inst, 0, CHAN_W);
1791               if (IS_DST0_CHANNEL_ENABLED(inst, CHAN_W)) {
1792                  tmp7 = tmp1;
1793               }
1794               tmp1 = lp_build_mul( &bld->base, tmp1, tmp1);
1795               tmp0 = lp_build_add( &bld->base, tmp0, tmp1);
1796            }
1797
1798            /* xmm1 = 1 / sqrt(xmm0) */
1799            tmp1 = lp_build_rsqrt( &bld->base, tmp0);
1800
1801            /* dst.x = xmm1 * src.x */
1802            if (IS_DST0_CHANNEL_ENABLED(inst, CHAN_X)) {
1803               dst0[CHAN_X] = lp_build_mul( &bld->base, tmp4, tmp1);
1804            }
1805
1806            /* dst.y = xmm1 * src.y */
1807            if (IS_DST0_CHANNEL_ENABLED(inst, CHAN_Y)) {
1808               dst0[CHAN_Y] = lp_build_mul( &bld->base, tmp5, tmp1);
1809            }
1810
1811            /* dst.z = xmm1 * src.z */
1812            if (IS_DST0_CHANNEL_ENABLED(inst, CHAN_Z)) {
1813               dst0[CHAN_Z] = lp_build_mul( &bld->base, tmp6, tmp1);
1814            }
1815
1816            /* dst.w = xmm1 * src.w */
1817            if (IS_DST0_CHANNEL_ENABLED(inst, CHAN_X) && dims == 4) {
1818               dst0[CHAN_W] = lp_build_mul( &bld->base, tmp7, tmp1);
1819            }
1820         }
1821
1822         /* dst.w = 1.0 */
1823         if (IS_DST0_CHANNEL_ENABLED(inst, CHAN_W) && dims == 3) {
1824            dst0[CHAN_W] = bld->base.one;
1825         }
1826      }
1827      break;
1828
1829   case TGSI_OPCODE_DIV:
1830      /* deprecated */
1831      assert( 0 );
1832      return FALSE;
1833      break;
1834
1835   case TGSI_OPCODE_DP2:
1836      tmp0 = emit_fetch( bld, inst, 0, CHAN_X );  /* xmm0 = src[0].x */
1837      tmp1 = emit_fetch( bld, inst, 1, CHAN_X );  /* xmm1 = src[1].x */
1838      tmp0 = lp_build_mul( &bld->base, tmp0, tmp1);              /* xmm0 = xmm0 * xmm1 */
1839      tmp1 = emit_fetch( bld, inst, 0, CHAN_Y );  /* xmm1 = src[0].y */
1840      tmp2 = emit_fetch( bld, inst, 1, CHAN_Y );  /* xmm2 = src[1].y */
1841      tmp1 = lp_build_mul( &bld->base, tmp1, tmp2);              /* xmm1 = xmm1 * xmm2 */
1842      tmp0 = lp_build_add( &bld->base, tmp0, tmp1);              /* xmm0 = xmm0 + xmm1 */
1843      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1844         dst0[chan_index] = tmp0;  /* dest[ch] = xmm0 */
1845      }
1846      break;
1847
1848   case TGSI_OPCODE_TXL:
1849      emit_tex( bld, inst, LP_BLD_TEX_MODIFIER_EXPLICIT_LOD, dst0 );
1850      break;
1851
1852   case TGSI_OPCODE_TXP:
1853      emit_tex( bld, inst, LP_BLD_TEX_MODIFIER_PROJECTED, dst0 );
1854      break;
1855
1856   case TGSI_OPCODE_BRK:
1857      lp_exec_break(&bld->exec_mask);
1858      break;
1859
1860   case TGSI_OPCODE_IF:
1861      tmp0 = emit_fetch(bld, inst, 0, CHAN_X);
1862      tmp0 = lp_build_cmp(&bld->base, PIPE_FUNC_NOTEQUAL,
1863                          tmp0, bld->base.zero);
1864      lp_exec_mask_cond_push(&bld->exec_mask, tmp0);
1865      break;
1866
1867   case TGSI_OPCODE_BGNLOOP:
1868      lp_exec_bgnloop(&bld->exec_mask);
1869      break;
1870
1871   case TGSI_OPCODE_BGNSUB:
1872      lp_exec_mask_bgnsub(&bld->exec_mask);
1873      break;
1874
1875   case TGSI_OPCODE_ELSE:
1876      lp_exec_mask_cond_invert(&bld->exec_mask);
1877      break;
1878
1879   case TGSI_OPCODE_ENDIF:
1880      lp_exec_mask_cond_pop(&bld->exec_mask);
1881      break;
1882
1883   case TGSI_OPCODE_ENDLOOP:
1884      lp_exec_endloop(&bld->exec_mask);
1885      break;
1886
1887   case TGSI_OPCODE_ENDSUB:
1888      lp_exec_mask_endsub(&bld->exec_mask, pc);
1889      break;
1890
1891   case TGSI_OPCODE_PUSHA:
1892      /* deprecated? */
1893      assert(0);
1894      return FALSE;
1895      break;
1896
1897   case TGSI_OPCODE_POPA:
1898      /* deprecated? */
1899      assert(0);
1900      return FALSE;
1901      break;
1902
1903   case TGSI_OPCODE_CEIL:
1904      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1905         tmp0 = emit_fetch( bld, inst, 0, chan_index );
1906         dst0[chan_index] = lp_build_ceil(&bld->base, tmp0);
1907      }
1908      break;
1909
1910   case TGSI_OPCODE_I2F:
1911      /* deprecated? */
1912      assert(0);
1913      return FALSE;
1914      break;
1915
1916   case TGSI_OPCODE_NOT:
1917      /* deprecated? */
1918      assert(0);
1919      return FALSE;
1920      break;
1921
1922   case TGSI_OPCODE_TRUNC:
1923      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
1924         tmp0 = emit_fetch( bld, inst, 0, chan_index );
1925         dst0[chan_index] = lp_build_trunc(&bld->base, tmp0);
1926      }
1927      break;
1928
1929   case TGSI_OPCODE_SHL:
1930      /* deprecated? */
1931      assert(0);
1932      return FALSE;
1933      break;
1934
1935   case TGSI_OPCODE_ISHR:
1936      /* deprecated? */
1937      assert(0);
1938      return FALSE;
1939      break;
1940
1941   case TGSI_OPCODE_AND:
1942      /* deprecated? */
1943      assert(0);
1944      return FALSE;
1945      break;
1946
1947   case TGSI_OPCODE_OR:
1948      /* deprecated? */
1949      assert(0);
1950      return FALSE;
1951      break;
1952
1953   case TGSI_OPCODE_MOD:
1954      /* deprecated? */
1955      assert(0);
1956      return FALSE;
1957      break;
1958
1959   case TGSI_OPCODE_XOR:
1960      /* deprecated? */
1961      assert(0);
1962      return FALSE;
1963      break;
1964
1965   case TGSI_OPCODE_SAD:
1966      /* deprecated? */
1967      assert(0);
1968      return FALSE;
1969      break;
1970
1971   case TGSI_OPCODE_TXF:
1972      /* deprecated? */
1973      assert(0);
1974      return FALSE;
1975      break;
1976
1977   case TGSI_OPCODE_TXQ:
1978      /* deprecated? */
1979      assert(0);
1980      return FALSE;
1981      break;
1982
1983   case TGSI_OPCODE_CONT:
1984      lp_exec_continue(&bld->exec_mask);
1985      break;
1986
1987   case TGSI_OPCODE_EMIT:
1988      return FALSE;
1989      break;
1990
1991   case TGSI_OPCODE_ENDPRIM:
1992      return FALSE;
1993      break;
1994
1995   case TGSI_OPCODE_NOP:
1996      break;
1997
1998   default:
1999      return FALSE;
2000   }
2001
2002   if(info->num_dst) {
2003      LLVMValueRef pred[NUM_CHANNELS];
2004
2005      emit_fetch_predicate( bld, inst, pred );
2006
2007      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
2008         emit_store( bld, inst, 0, chan_index, pred[chan_index], dst0[chan_index]);
2009      }
2010   }
2011
2012   return TRUE;
2013}
2014
2015
2016void
2017lp_build_tgsi_soa(LLVMBuilderRef builder,
2018                  const struct tgsi_token *tokens,
2019                  struct lp_type type,
2020                  struct lp_build_mask_context *mask,
2021                  LLVMValueRef consts_ptr,
2022                  const LLVMValueRef *pos,
2023                  const LLVMValueRef (*inputs)[NUM_CHANNELS],
2024                  LLVMValueRef (*outputs)[NUM_CHANNELS],
2025                  struct lp_build_sampler_soa *sampler,
2026                  const struct tgsi_shader_info *info)
2027{
2028   struct lp_build_tgsi_soa_context bld;
2029   struct tgsi_parse_context parse;
2030   uint num_immediates = 0;
2031   uint num_instructions = 0;
2032   unsigned i;
2033   int pc = 0;
2034
2035   struct lp_type res_type;
2036
2037   assert(type.length <= LP_MAX_VECTOR_LENGTH);
2038   memset(&res_type, 0, sizeof res_type);
2039   res_type.width = type.width;
2040   res_type.length = type.length;
2041   res_type.sign = 1;
2042
2043   /* Setup build context */
2044   memset(&bld, 0, sizeof bld);
2045   lp_build_context_init(&bld.base, builder, type);
2046   lp_build_context_init(&bld.uint_bld, builder, lp_uint_type(type));
2047   bld.mask = mask;
2048   bld.pos = pos;
2049   bld.inputs = inputs;
2050   bld.outputs = outputs;
2051   bld.consts_ptr = consts_ptr;
2052   bld.sampler = sampler;
2053   bld.info = info;
2054   bld.indirect_files = info->indirect_files;
2055   bld.instructions = (struct tgsi_full_instruction *)
2056                      MALLOC( LP_MAX_INSTRUCTIONS * sizeof(struct tgsi_full_instruction) );
2057   bld.max_instructions = LP_MAX_INSTRUCTIONS;
2058
2059   if (!bld.instructions) {
2060      return;
2061   }
2062
2063   lp_exec_mask_init(&bld.exec_mask, &bld.base);
2064
2065   tgsi_parse_init( &parse, tokens );
2066
2067   while( !tgsi_parse_end_of_tokens( &parse ) ) {
2068      tgsi_parse_token( &parse );
2069
2070      switch( parse.FullToken.Token.Type ) {
2071      case TGSI_TOKEN_TYPE_DECLARATION:
2072         /* Inputs already interpolated */
2073         emit_declaration( &bld, &parse.FullToken.FullDeclaration );
2074         break;
2075
2076      case TGSI_TOKEN_TYPE_INSTRUCTION:
2077         {
2078            /* save expanded instruction */
2079            if (num_instructions == bld.max_instructions) {
2080               struct tgsi_full_instruction *instructions;
2081               instructions = REALLOC(bld.instructions,
2082                                      bld.max_instructions
2083                                      * sizeof(struct tgsi_full_instruction),
2084                                      (bld.max_instructions + LP_MAX_INSTRUCTIONS)
2085                                      * sizeof(struct tgsi_full_instruction));
2086               if (!instructions) {
2087                  break;
2088               }
2089               bld.instructions = instructions;
2090               bld.max_instructions += LP_MAX_INSTRUCTIONS;
2091            }
2092
2093            memcpy(bld.instructions + num_instructions,
2094                   &parse.FullToken.FullInstruction,
2095                   sizeof(bld.instructions[0]));
2096
2097            num_instructions++;
2098         }
2099
2100         break;
2101
2102      case TGSI_TOKEN_TYPE_IMMEDIATE:
2103         /* simply copy the immediate values into the next immediates[] slot */
2104         {
2105            const uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
2106            assert(size <= 4);
2107            assert(num_immediates < LP_MAX_TGSI_IMMEDIATES);
2108            for( i = 0; i < size; ++i )
2109               bld.immediates[num_immediates][i] =
2110                  lp_build_const_vec(type, parse.FullToken.FullImmediate.u[i].Float);
2111            for( i = size; i < 4; ++i )
2112               bld.immediates[num_immediates][i] = bld.base.undef;
2113            num_immediates++;
2114         }
2115         break;
2116
2117      case TGSI_TOKEN_TYPE_PROPERTY:
2118         break;
2119
2120      default:
2121         assert( 0 );
2122      }
2123   }
2124
2125   while (pc != -1) {
2126      struct tgsi_full_instruction *instr = bld.instructions + pc;
2127      const struct tgsi_opcode_info *opcode_info =
2128         tgsi_get_opcode_info(instr->Instruction.Opcode);
2129      if (!emit_instruction( &bld, instr, opcode_info, &pc ))
2130         _debug_printf("warning: failed to translate tgsi opcode %s to LLVM\n",
2131                       opcode_info->mnemonic);
2132   }
2133
2134   if (0) {
2135      LLVMBasicBlockRef block = LLVMGetInsertBlock(builder);
2136      LLVMValueRef function = LLVMGetBasicBlockParent(block);
2137      debug_printf("11111111111111111111111111111 \n");
2138      tgsi_dump(tokens, 0);
2139      lp_debug_dump_value(function);
2140      debug_printf("2222222222222222222222222222 \n");
2141   }
2142   tgsi_parse_free( &parse );
2143
2144   if (0) {
2145      LLVMModuleRef module = LLVMGetGlobalParent(
2146         LLVMGetBasicBlockParent(LLVMGetInsertBlock(bld.base.builder)));
2147      LLVMDumpModule(module);
2148
2149   }
2150
2151   FREE( bld.instructions );
2152}
2153
2154