/* brw_fs.cpp — revision 3c97c00e3810d31c3aa26173eb9fdef91b3e7c87 */
1/*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * Authors:
24 *    Eric Anholt <eric@anholt.net>
25 *
26 */
27
28extern "C" {
29
30#include <sys/types.h>
31
32#include "main/macros.h"
33#include "main/shaderobj.h"
34#include "main/uniforms.h"
35#include "program/prog_parameter.h"
36#include "program/prog_print.h"
37#include "program/prog_optimize.h"
38#include "program/register_allocate.h"
39#include "program/sampler.h"
40#include "program/hash_table.h"
41#include "brw_context.h"
42#include "brw_eu.h"
43#include "brw_wm.h"
44#include "talloc.h"
45}
46#include "../glsl/glsl_types.h"
47#include "../glsl/ir_optimization.h"
48#include "../glsl/ir_print_visitor.h"
49
/* Register files an fs_reg can live in.  The first four alias the hardware
 * BRW_*_REGISTER_FILE encodings; the remaining values are software-only
 * files that are resolved before final code generation.
 */
enum register_file {
   ARF = BRW_ARCHITECTURE_REGISTER_FILE,
   GRF = BRW_GENERAL_REGISTER_FILE,
   MRF = BRW_MESSAGE_REGISTER_FILE,
   IMM = BRW_IMMEDIATE_VALUE,
   FIXED_HW_REG, /* a struct brw_reg */
   UNIFORM, /* prog_data->params[hw_reg] */
   BAD_FILE /* sentinel for an unset register (see fs_reg()) */
};
59
/* FS-IR virtual opcodes.  Values start at 256 so they can share
 * fs_inst::opcode with the hardware BRW_OPCODE_* encodings without
 * colliding; generate_code() lowers them to real instructions.
 */
enum fs_opcodes {
   FS_OPCODE_FB_WRITE = 256,
   FS_OPCODE_RCP,
   FS_OPCODE_RSQ,
   FS_OPCODE_SQRT,
   FS_OPCODE_EXP2,
   FS_OPCODE_LOG2,
   FS_OPCODE_POW,
   FS_OPCODE_SIN,
   FS_OPCODE_COS,
   FS_OPCODE_DDX,
   FS_OPCODE_DDY,
   FS_OPCODE_LINTERP,
   FS_OPCODE_TEX,
   FS_OPCODE_TXB,
   FS_OPCODE_TXL,
   FS_OPCODE_DISCARD,
};
78
/* -1 until the first link; then 1 if INTEL_NEW_FS was set in the
 * environment (opting into this backend), else 0.  See brw_link_shader().
 */
static int using_new_fs = -1;
static struct brw_reg brw_reg_from_fs_reg(class fs_reg *reg);
81
82struct gl_shader *
83brw_new_shader(GLcontext *ctx, GLuint name, GLuint type)
84{
85   struct brw_shader *shader;
86
87   shader = talloc_zero(NULL, struct brw_shader);
88   if (shader) {
89      shader->base.Type = type;
90      shader->base.Name = name;
91      _mesa_init_shader(ctx, &shader->base);
92   }
93
94   return &shader->base;
95}
96
97struct gl_shader_program *
98brw_new_shader_program(GLcontext *ctx, GLuint name)
99{
100   struct brw_shader_program *prog;
101   prog = talloc_zero(NULL, struct brw_shader_program);
102   if (prog) {
103      prog->base.Name = name;
104      _mesa_init_shader_program(ctx, &prog->base);
105   }
106   return &prog->base;
107}
108
109GLboolean
110brw_compile_shader(GLcontext *ctx, struct gl_shader *shader)
111{
112   if (!_mesa_ir_compile_shader(ctx, shader))
113      return GL_FALSE;
114
115   return GL_TRUE;
116}
117
/**
 * Driver hook: runs this backend's GLSL lowering pipeline on fragment
 * shaders (when INTEL_NEW_FS is set), then performs the normal Mesa link.
 */
GLboolean
brw_link_shader(GLcontext *ctx, struct gl_shader_program *prog)
{
   /* One-time check of the INTEL_NEW_FS environment variable that opts
    * shaders into this backend.
    */
   if (using_new_fs == -1)
      using_new_fs = getenv("INTEL_NEW_FS") != NULL;

   for (unsigned i = 0; i < prog->_NumLinkedShaders; i++) {
      struct brw_shader *shader = (struct brw_shader *)prog->_LinkedShaders[i];

      if (using_new_fs && shader->base.Type == GL_FRAGMENT_SHADER) {
	 void *mem_ctx = talloc_new(NULL);
	 bool progress;

	 /* Replace any IR left over from a previous link with a fresh
	  * clone, so the destructive passes below never touch
	  * shader->base.ir.
	  */
	 if (shader->ir)
	    talloc_free(shader->ir);
	 shader->ir = new(shader) exec_list;
	 clone_ir_list(mem_ctx, shader->ir, shader->base.ir);

	 /* One-shot lowering passes: rewrite constructs this backend
	  * cannot codegen directly into ones it can.
	  */
	 do_mat_op_to_vec(shader->ir);
	 do_mod_to_fract(shader->ir);
	 do_div_to_mul_rcp(shader->ir);
	 do_sub_to_add_neg(shader->ir);
	 do_explog_to_explog2(shader->ir);
	 do_lower_texture_projection(shader->ir);

	 /* Iterate lowering + optimization until a fixed point. */
	 do {
	    progress = false;

	    brw_do_channel_expressions(shader->ir);
	    brw_do_vector_splitting(shader->ir);

	    progress = do_lower_jumps(shader->ir, true, true,
				      true, /* main return */
				      false, /* continue */
				      false /* loops */
				      ) || progress;

	    progress = do_common_optimization(shader->ir, true, 32) || progress;

	    progress = lower_noise(shader->ir) || progress;
	    progress =
	       lower_variable_index_to_cond_assign(shader->ir,
						   GL_TRUE, /* input */
						   GL_TRUE, /* output */
						   GL_TRUE, /* temp */
						   GL_TRUE /* uniform */
						   ) || progress;
	 } while (progress);

	 validate_ir_tree(shader->ir);

	 /* Move the surviving IR off the scratch context so freeing
	  * mem_ctx only discards nodes made dead by the passes above.
	  */
	 reparent_ir(shader->ir, shader->ir);
	 talloc_free(mem_ctx);
      }
   }

   if (!_mesa_ir_link_shader(ctx, prog))
      return GL_FALSE;

   return GL_TRUE;
}
179
180static int
181type_size(const struct glsl_type *type)
182{
183   unsigned int size, i;
184
185   switch (type->base_type) {
186   case GLSL_TYPE_UINT:
187   case GLSL_TYPE_INT:
188   case GLSL_TYPE_FLOAT:
189   case GLSL_TYPE_BOOL:
190      return type->components();
191   case GLSL_TYPE_ARRAY:
192      return type_size(type->fields.array) * type->length;
193   case GLSL_TYPE_STRUCT:
194      size = 0;
195      for (i = 0; i < type->length; i++) {
196	 size += type_size(type->fields.structure[i].type);
197      }
198      return size;
199   case GLSL_TYPE_SAMPLER:
200      /* Samplers take up no register space, since they're baked in at
201       * link time.
202       */
203      return 0;
204   default:
205      assert(!"not reached");
206      return 0;
207   }
208}
209
/**
 * An operand of an fs_inst: a virtual GRF, uniform slot, immediate,
 * fixed hardware register, or the BAD_FILE sentinel.  Mapped to a real
 * struct brw_reg by brw_reg_from_fs_reg() after register allocation.
 */
class fs_reg {
public:
   /* Callers of this talloc-based new need not call delete. It's
    * easier to just talloc_free 'ctx' (or any of its ancestors). */
   static void* operator new(size_t size, void *ctx)
   {
      void *node;

      /* NOTE(review): talloc_size does not zero the allocation, unlike
       * fs_inst's talloc_zero_size — constructors must set every field
       * they rely on.
       */
      node = talloc_size(ctx, size);
      assert(node != NULL);

      return node;
   }

   /* Field setup shared by all constructors.  file, type and imm are
    * left for each constructor to fill in.
    */
   void init()
   {
      this->reg = 0;
      this->reg_offset = 0;
      this->negate = 0;
      this->abs = 0;
      this->hw_reg = -1;
   }

   /** Generic unset register constructor. */
   fs_reg()
   {
      init();
      this->file = BAD_FILE;
   }

   /** Immediate value constructor. */
   fs_reg(float f)
   {
      init();
      this->file = IMM;
      this->type = BRW_REGISTER_TYPE_F;
      this->imm.f = f;
   }

   /** Immediate value constructor. */
   fs_reg(int32_t i)
   {
      init();
      this->file = IMM;
      this->type = BRW_REGISTER_TYPE_D;
      this->imm.i = i;
   }

   /** Immediate value constructor. */
   fs_reg(uint32_t u)
   {
      init();
      this->file = IMM;
      this->type = BRW_REGISTER_TYPE_UD;
      this->imm.u = u;
   }

   /** Fixed brw_reg Immediate value constructor. */
   fs_reg(struct brw_reg fixed_hw_reg)
   {
      init();
      this->file = FIXED_HW_REG;
      this->fixed_hw_reg = fixed_hw_reg;
      this->type = fixed_hw_reg.type;
   }

   fs_reg(enum register_file file, int hw_reg);
   fs_reg(class fs_visitor *v, const struct glsl_type *type);

   /** Register file: ARF, GRF, MRF, IMM. */
   enum register_file file;
   /** virtual register number.  0 = fixed hw reg */
   int reg;
   /** Offset within the virtual register. */
   int reg_offset;
   /** HW register number.  Generally unset until register allocation. */
   int hw_reg;
   /** Register type.  BRW_REGISTER_TYPE_* */
   int type;
   bool negate;
   bool abs;
   /** Underlying hardware register when file == FIXED_HW_REG. */
   struct brw_reg fixed_hw_reg;

   /** Value for file == IMM */
   union {
      int32_t i;
      uint32_t u;
      float f;
   } imm;
};
300
/* Shared sentinel operands: an unset (BAD_FILE) register and the
 * hardware null register, used as a don't-care destination.
 */
static const fs_reg reg_undef;
static const fs_reg reg_null(ARF, BRW_ARF_NULL);
303
304class fs_inst : public exec_node {
305public:
306   /* Callers of this talloc-based new need not call delete. It's
307    * easier to just talloc_free 'ctx' (or any of its ancestors). */
308   static void* operator new(size_t size, void *ctx)
309   {
310      void *node;
311
312      node = talloc_zero_size(ctx, size);
313      assert(node != NULL);
314
315      return node;
316   }
317
318   void init()
319   {
320      this->opcode = BRW_OPCODE_NOP;
321      this->saturate = false;
322      this->conditional_mod = BRW_CONDITIONAL_NONE;
323      this->predicated = false;
324      this->sampler = 0;
325      this->target = 0;
326      this->eot = false;
327      this->header_present = false;
328      this->shadow_compare = false;
329   }
330
331   fs_inst()
332   {
333      init();
334   }
335
336   fs_inst(int opcode)
337   {
338      init();
339      this->opcode = opcode;
340   }
341
342   fs_inst(int opcode, fs_reg dst, fs_reg src0)
343   {
344      init();
345      this->opcode = opcode;
346      this->dst = dst;
347      this->src[0] = src0;
348   }
349
350   fs_inst(int opcode, fs_reg dst, fs_reg src0, fs_reg src1)
351   {
352      init();
353      this->opcode = opcode;
354      this->dst = dst;
355      this->src[0] = src0;
356      this->src[1] = src1;
357   }
358
359   fs_inst(int opcode, fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2)
360   {
361      init();
362      this->opcode = opcode;
363      this->dst = dst;
364      this->src[0] = src0;
365      this->src[1] = src1;
366      this->src[2] = src2;
367   }
368
369   int opcode; /* BRW_OPCODE_* or FS_OPCODE_* */
370   fs_reg dst;
371   fs_reg src[3];
372   bool saturate;
373   bool predicated;
374   int conditional_mod; /**< BRW_CONDITIONAL_* */
375
376   int mlen; /**< SEND message length */
377   int sampler;
378   int target; /**< MRT target. */
379   bool eot;
380   bool header_present;
381   bool shadow_compare;
382
383   /** @{
384    * Annotation for the generated IR.  One of the two can be set.
385    */
386   ir_instruction *ir;
387   const char *annotation;
388   /** @} */
389};
390
/**
 * Walks a lowered GLSL IR tree and emits FS-IR (fs_inst) into
 * \c instructions, then (via the generate_* methods) turns that into
 * native code through the brw_compile EU emitter.
 */
class fs_visitor : public ir_visitor
{
public:

   fs_visitor(struct brw_wm_compile *c, struct brw_shader *shader)
   {
      this->c = c;
      this->p = &c->func;
      this->brw = p->brw;
      this->fp = brw->fragment_program;
      this->intel = &brw->intel;
      this->ctx = &intel->ctx;
      this->mem_ctx = talloc_new(NULL);
      this->shader = shader;
      this->fail = false;
      /* Maps ir_variable -> fs_reg storage; pointer identity keys. */
      this->variable_ht = hash_table_ctor(0,
					  hash_table_pointer_hash,
					  hash_table_pointer_compare);

      this->frag_color = NULL;
      this->frag_data = NULL;
      this->frag_depth = NULL;
      this->first_non_payload_grf = 0;

      this->current_annotation = NULL;
      this->annotation_string = NULL;
      this->annotation_ir = NULL;
      this->base_ir = NULL;

      this->virtual_grf_sizes = NULL;
      /* Virtual GRF 0 is reserved to mean "fixed hw reg" in fs_reg. */
      this->virtual_grf_next = 1;
      this->virtual_grf_array_size = 0;
      this->virtual_grf_def = NULL;
      this->virtual_grf_use = NULL;

      this->kill_emitted = false;
   }

   ~fs_visitor()
   {
      talloc_free(this->mem_ctx);
      hash_table_dtor(this->variable_ht);
   }

   fs_reg *variable_storage(ir_variable *var);
   int virtual_grf_alloc(int size);

   /* ir_visitor overrides: each leaves its value in this->result. */
   void visit(ir_variable *ir);
   void visit(ir_assignment *ir);
   void visit(ir_dereference_variable *ir);
   void visit(ir_dereference_record *ir);
   void visit(ir_dereference_array *ir);
   void visit(ir_expression *ir);
   void visit(ir_texture *ir);
   void visit(ir_if *ir);
   void visit(ir_constant *ir);
   void visit(ir_swizzle *ir);
   void visit(ir_return *ir);
   void visit(ir_loop *ir);
   void visit(ir_loop_jump *ir);
   void visit(ir_discard *ir);
   void visit(ir_call *ir);
   void visit(ir_function *ir);
   void visit(ir_function_signature *ir);

   /* FS-IR construction, optimization and register assignment. */
   fs_inst *emit(fs_inst inst);
   void assign_curb_setup();
   void calculate_urb_setup();
   void assign_urb_setup();
   void assign_regs();
   void assign_regs_trivial();
   void calculate_live_intervals();
   bool propagate_constants();
   bool dead_code_eliminate();
   bool virtual_grf_interferes(int a, int b);

   /* Native code generation from the FS-IR. */
   void generate_code();
   void generate_fb_write(fs_inst *inst);
   void generate_linterp(fs_inst *inst, struct brw_reg dst,
			 struct brw_reg *src);
   void generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src);
   void generate_math(fs_inst *inst, struct brw_reg dst, struct brw_reg *src);
   void generate_discard(fs_inst *inst, struct brw_reg temp);
   void generate_ddx(fs_inst *inst, struct brw_reg dst, struct brw_reg src);
   void generate_ddy(fs_inst *inst, struct brw_reg dst, struct brw_reg src);

   void emit_dummy_fs();
   void emit_fragcoord_interpolation(ir_variable *ir);
   void emit_general_interpolation(ir_variable *ir);
   void emit_interpolation_setup_gen4();
   void emit_interpolation_setup_gen6();
   fs_inst *emit_texture_gen4(ir_texture *ir, fs_reg dst, fs_reg coordinate);
   fs_inst *emit_texture_gen5(ir_texture *ir, fs_reg dst, fs_reg coordinate);
   void emit_fb_writes();
   void emit_assignment_writes(fs_reg &l, fs_reg &r,
			       const glsl_type *type, bool predicated);

   struct brw_reg interp_reg(int location, int channel);
   int setup_uniform_values(int loc, const glsl_type *type);
   void setup_builtin_uniform_values(ir_variable *ir);

   struct brw_context *brw;
   const struct gl_fragment_program *fp;
   struct intel_context *intel;
   GLcontext *ctx;
   struct brw_wm_compile *c;
   struct brw_compile *p;
   struct brw_shader *shader;
   void *mem_ctx;
   /* The FS-IR instruction stream built by the visit methods. */
   exec_list instructions;

   /* Per-virtual-GRF tracking: size, and def/use IPs for liveness. */
   int *virtual_grf_sizes;
   int virtual_grf_next;
   int virtual_grf_array_size;
   int *virtual_grf_def;
   int *virtual_grf_use;

   struct hash_table *variable_ht;
   ir_variable *frag_color, *frag_data, *frag_depth;
   int first_non_payload_grf;
   int urb_setup[FRAG_ATTRIB_MAX];
   bool kill_emitted;

   /** @{ debug annotation info */
   const char *current_annotation;
   ir_instruction *base_ir;
   const char **annotation_string;
   ir_instruction **annotation_ir;
   /** @} */

   /* Set when compilation cannot proceed; checked by the caller. */
   bool fail;

   /* Result of last visit() method. */
   fs_reg result;

   /* Payload-derived per-fragment values set up before visiting. */
   fs_reg pixel_x;
   fs_reg pixel_y;
   fs_reg wpos_w;
   fs_reg pixel_w;
   fs_reg delta_x;
   fs_reg delta_y;

   int grf_used;

};
535
/**
 * Allocates a new virtual GRF of \p size registers and returns its
 * number, growing the size array geometrically when needed.
 */
int
fs_visitor::virtual_grf_alloc(int size)
{
   if (virtual_grf_array_size <= virtual_grf_next) {
      if (virtual_grf_array_size == 0)
	 virtual_grf_array_size = 16;
      else
	 virtual_grf_array_size *= 2;
      virtual_grf_sizes = talloc_realloc(mem_ctx, virtual_grf_sizes,
					 int, virtual_grf_array_size);

      /* This slot is always unused. */
      /* (fs_reg::reg == 0 means "fixed hw reg", so numbering starts at 1.) */
      virtual_grf_sizes[0] = 0;
   }
   virtual_grf_sizes[virtual_grf_next] = size;
   return virtual_grf_next++;
}
553
554/** Fixed HW reg constructor. */
555fs_reg::fs_reg(enum register_file file, int hw_reg)
556{
557   init();
558   this->file = file;
559   this->hw_reg = hw_reg;
560   this->type = BRW_REGISTER_TYPE_F;
561}
562
563int
564brw_type_for_base_type(const struct glsl_type *type)
565{
566   switch (type->base_type) {
567   case GLSL_TYPE_FLOAT:
568      return BRW_REGISTER_TYPE_F;
569   case GLSL_TYPE_INT:
570   case GLSL_TYPE_BOOL:
571      return BRW_REGISTER_TYPE_D;
572   case GLSL_TYPE_UINT:
573      return BRW_REGISTER_TYPE_UD;
574   case GLSL_TYPE_ARRAY:
575   case GLSL_TYPE_STRUCT:
576      /* These should be overridden with the type of the member when
577       * dereferenced into.  BRW_REGISTER_TYPE_UD seems like a likely
578       * way to trip up if we don't.
579       */
580      return BRW_REGISTER_TYPE_UD;
581   default:
582      assert(!"not reached");
583      return BRW_REGISTER_TYPE_F;
584   }
585}
586
587/** Automatic reg constructor. */
588fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
589{
590   init();
591
592   this->file = GRF;
593   this->reg = v->virtual_grf_alloc(type_size(type));
594   this->reg_offset = 0;
595   this->type = brw_type_for_base_type(type);
596}
597
598fs_reg *
599fs_visitor::variable_storage(ir_variable *var)
600{
601   return (fs_reg *)hash_table_find(this->variable_ht, var);
602}
603
/* Our support for uniforms is piggy-backed on the struct
 * gl_fragment_program, because that's where the values actually
 * get stored, rather than in some global gl_shader_program uniform
 * store.
 */
/**
 * Recursively walks \p type starting at parameter location \p loc,
 * appending a pointer to each component's storage to c->prog_data.param.
 *
 * \return the number of parameter locations consumed.
 */
int
fs_visitor::setup_uniform_values(int loc, const glsl_type *type)
{
   unsigned int offset = 0;
   float *vec_values;

   /* A matrix is laid out as matrix_columns consecutive column vectors. */
   if (type->is_matrix()) {
      const glsl_type *column = glsl_type::get_instance(GLSL_TYPE_FLOAT,
							type->vector_elements,
							1);

      for (unsigned int i = 0; i < type->matrix_columns; i++) {
	 offset += setup_uniform_values(loc + offset, column);
      }

      return offset;
   }

   switch (type->base_type) {
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_BOOL:
      /* Scalar/vector: one location; one param pointer per component. */
      vec_values = fp->Base.Parameters->ParameterValues[loc];
      for (unsigned int i = 0; i < type->vector_elements; i++) {
	 c->prog_data.param[c->prog_data.nr_params++] = &vec_values[i];
      }
      return 1;

   case GLSL_TYPE_STRUCT:
      for (unsigned int i = 0; i < type->length; i++) {
	 offset += setup_uniform_values(loc + offset,
					type->fields.structure[i].type);
      }
      return offset;

   case GLSL_TYPE_ARRAY:
      for (unsigned int i = 0; i < type->length; i++) {
	 offset += setup_uniform_values(loc + offset, type->fields.array);
      }
      return offset;

   case GLSL_TYPE_SAMPLER:
      /* The sampler takes up a slot, but we don't use any values from it. */
      return 1;

   default:
      assert(!"not reached");
      return 0;
   }
}
660
661
662/* Our support for builtin uniforms is even scarier than non-builtin.
663 * It sits on top of the PROG_STATE_VAR parameters that are
664 * automatically updated from GL context state.
665 */
666void
667fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
668{
669   const struct gl_builtin_uniform_desc *statevar = NULL;
670
671   for (unsigned int i = 0; _mesa_builtin_uniform_desc[i].name; i++) {
672      statevar = &_mesa_builtin_uniform_desc[i];
673      if (strcmp(ir->name, _mesa_builtin_uniform_desc[i].name) == 0)
674	 break;
675   }
676
677   if (!statevar->name) {
678      this->fail = true;
679      printf("Failed to find builtin uniform `%s'\n", ir->name);
680      return;
681   }
682
683   int array_count;
684   if (ir->type->is_array()) {
685      array_count = ir->type->length;
686   } else {
687      array_count = 1;
688   }
689
690   for (int a = 0; a < array_count; a++) {
691      for (unsigned int i = 0; i < statevar->num_elements; i++) {
692	 struct gl_builtin_uniform_element *element = &statevar->elements[i];
693	 int tokens[STATE_LENGTH];
694
695	 memcpy(tokens, element->tokens, sizeof(element->tokens));
696	 if (ir->type->is_array()) {
697	    tokens[1] = a;
698	 }
699
700	 /* This state reference has already been setup by ir_to_mesa,
701	  * but we'll get the same index back here.
702	  */
703	 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
704					       (gl_state_index *)tokens);
705	 float *vec_values = this->fp->Base.Parameters->ParameterValues[index];
706
707	 /* Add each of the unique swizzles of the element as a
708	  * parameter.  This'll end up matching the expected layout of
709	  * the array/matrix/structure we're trying to fill in.
710	  */
711	 int last_swiz = -1;
712	 for (unsigned int i = 0; i < 4; i++) {
713	    int swiz = GET_SWZ(element->swizzle, i);
714	    if (swiz == last_swiz)
715	       break;
716	    last_swiz = swiz;
717
718	    c->prog_data.param[c->prog_data.nr_params++] = &vec_values[swiz];
719	 }
720      }
721   }
722}
723
724void
725fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
726{
727   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
728   fs_reg wpos = *reg;
729   fs_reg neg_y = this->pixel_y;
730   neg_y.negate = true;
731
732   /* gl_FragCoord.x */
733   if (ir->pixel_center_integer) {
734      emit(fs_inst(BRW_OPCODE_MOV, wpos, this->pixel_x));
735   } else {
736      emit(fs_inst(BRW_OPCODE_ADD, wpos, this->pixel_x, fs_reg(0.5f)));
737   }
738   wpos.reg_offset++;
739
740   /* gl_FragCoord.y */
741   if (ir->origin_upper_left && ir->pixel_center_integer) {
742      emit(fs_inst(BRW_OPCODE_MOV, wpos, this->pixel_y));
743   } else {
744      fs_reg pixel_y = this->pixel_y;
745      float offset = (ir->pixel_center_integer ? 0.0 : 0.5);
746
747      if (!ir->origin_upper_left) {
748	 pixel_y.negate = true;
749	 offset += c->key.drawable_height - 1.0;
750      }
751
752      emit(fs_inst(BRW_OPCODE_ADD, wpos, pixel_y, fs_reg(offset)));
753   }
754   wpos.reg_offset++;
755
756   /* gl_FragCoord.z */
757   emit(fs_inst(FS_OPCODE_LINTERP, wpos, this->delta_x, this->delta_y,
758		interp_reg(FRAG_ATTRIB_WPOS, 2)));
759   wpos.reg_offset++;
760
761   /* gl_FragCoord.w: Already set up in emit_interpolation */
762   emit(fs_inst(BRW_OPCODE_MOV, wpos, this->wpos_w));
763
764   hash_table_insert(this->variable_ht, reg, ir);
765}
766
767
/**
 * Emits perspective-correct interpolation for a varying input: LINTERP
 * per channel from the URB setup data, then a multiply by pixel_w
 * (1/w correction).  Arrays/matrices interpolate one column per URB slot.
 */
void
fs_visitor::emit_general_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   /* Interpolation is always in floating point regs. */
   reg->type = BRW_REGISTER_TYPE_F;
   fs_reg attr = *reg;

   unsigned int array_elements;
   const glsl_type *type;

   if (ir->type->is_array()) {
      array_elements = ir->type->length;
      if (array_elements == 0) {
	 this->fail = true;
      }
      type = ir->type->fields.array;
   } else {
      array_elements = 1;
      type = ir->type;
   }

   int location = ir->location;
   for (unsigned int i = 0; i < array_elements; i++) {
      for (unsigned int j = 0; j < type->matrix_columns; j++) {
	 if (urb_setup[location] == -1) {
	    /* If there's no incoming setup data for this slot, don't
	     * emit interpolation for it.
	     */
	    attr.reg_offset += type->vector_elements;
	    location++;
	    continue;
	 }

	 /* NOTE: loop variable c shadows the brw_wm_compile *c member. */
	 for (unsigned int c = 0; c < type->vector_elements; c++) {
	    struct brw_reg interp = interp_reg(location, c);
	    emit(fs_inst(FS_OPCODE_LINTERP,
			 attr,
			 this->delta_x,
			 this->delta_y,
			 fs_reg(interp)));
	    attr.reg_offset++;
	 }
	 /* Rewind to multiply the same channels by pixel_w below. */
	 attr.reg_offset -= type->vector_elements;

	 for (unsigned int c = 0; c < type->vector_elements; c++) {
	    emit(fs_inst(BRW_OPCODE_MUL,
			 attr,
			 attr,
			 this->pixel_w));
	    attr.reg_offset++;
	 }
	 location++;
      }
   }

   hash_table_insert(this->variable_ht, reg, ir);
}
826
/**
 * Binds storage for a variable declaration: inputs get interpolation
 * emitted, uniforms get parameter slots, everything else gets a fresh
 * virtual GRF.  The resulting fs_reg is stored in variable_ht.
 */
void
fs_visitor::visit(ir_variable *ir)
{
   fs_reg *reg = NULL;

   if (variable_storage(ir))
      return;

   /* Remember the output variables so emit_fb_writes() can find them. */
   if (strcmp(ir->name, "gl_FragColor") == 0) {
      this->frag_color = ir;
   } else if (strcmp(ir->name, "gl_FragData") == 0) {
      this->frag_data = ir;
   } else if (strcmp(ir->name, "gl_FragDepth") == 0) {
      this->frag_depth = ir;
   }

   if (ir->mode == ir_var_in) {
      if (!strcmp(ir->name, "gl_FragCoord")) {
	 emit_fragcoord_interpolation(ir);
	 return;
      } else if (!strcmp(ir->name, "gl_FrontFacing")) {
	 reg = new(this->mem_ctx) fs_reg(this, ir->type);
	 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
	 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
	  * us front face
	  */
	 fs_inst *inst = emit(fs_inst(BRW_OPCODE_CMP,
				      *reg,
				      fs_reg(r1_6ud),
				      fs_reg(1u << 31)));
	 inst->conditional_mod = BRW_CONDITIONAL_L;
	 /* Reduce the CMP's all-ones true value to a 0/1 boolean. */
	 emit(fs_inst(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u)));
      } else {
	 emit_general_interpolation(ir);
	 return;
      }
   }

   if (ir->mode == ir_var_uniform) {
      /* Captured before the setup calls append params, so the UNIFORM
       * register below points at this variable's first slot.
       */
      int param_index = c->prog_data.nr_params;

      if (!strncmp(ir->name, "gl_", 3)) {
	 setup_builtin_uniform_values(ir);
      } else {
	 setup_uniform_values(ir->location, ir->type);
      }

      reg = new(this->mem_ctx) fs_reg(UNIFORM, param_index);
   }

   if (!reg)
      reg = new(this->mem_ctx) fs_reg(this, ir->type);

   hash_table_insert(this->variable_ht, reg, ir);
}
882
/**
 * Loads the storage bound to a variable into this->result.
 *
 * NOTE(review): assumes the declaration was visited first; reg is not
 * NULL-checked, so an unbound variable would crash here.
 */
void
fs_visitor::visit(ir_dereference_variable *ir)
{
   fs_reg *reg = variable_storage(ir->var);
   this->result = *reg;
}
889
/**
 * Offsets this->result into the named field of a struct value by summing
 * the sizes of the fields that precede it.
 */
void
fs_visitor::visit(ir_dereference_record *ir)
{
   const glsl_type *struct_type = ir->record->type;

   /* Evaluate the struct; this->result then refers to its base. */
   ir->record->accept(this);

   unsigned int offset = 0;
   for (unsigned int i = 0; i < struct_type->length; i++) {
      if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
	 break;
      offset += type_size(struct_type->fields.structure[i].type);
   }
   this->result.reg_offset += offset;
   this->result.type = brw_type_for_base_type(ir->type);
}
906
/**
 * Offsets this->result into an array element.  Only constant indices
 * are supported; variable indexing was lowered to conditional assigns
 * at link time (see lower_variable_index_to_cond_assign).
 */
void
fs_visitor::visit(ir_dereference_array *ir)
{
   ir_constant *index;
   int element_size;

   ir->array->accept(this);
   index = ir->array_index->as_constant();

   element_size = type_size(ir->type);
   this->result.type = brw_type_for_base_type(ir->type);

   if (index) {
      assert(this->result.file == UNIFORM ||
	     (this->result.file == GRF &&
	      this->result.reg != 0));
      this->result.reg_offset += index->value.i[0] * element_size;
   } else {
      assert(!"FINISHME: non-constant array element");
   }
}
928
929void
930fs_visitor::visit(ir_expression *ir)
931{
932   unsigned int operand;
933   fs_reg op[2], temp;
934   fs_reg result;
935   fs_inst *inst;
936
937   for (operand = 0; operand < ir->get_num_operands(); operand++) {
938      ir->operands[operand]->accept(this);
939      if (this->result.file == BAD_FILE) {
940	 ir_print_visitor v;
941	 printf("Failed to get tree for expression operand:\n");
942	 ir->operands[operand]->accept(&v);
943	 this->fail = true;
944      }
945      op[operand] = this->result;
946
947      /* Matrix expression operands should have been broken down to vector
948       * operations already.
949       */
950      assert(!ir->operands[operand]->type->is_matrix());
951      /* And then those vector operands should have been broken down to scalar.
952       */
953      assert(!ir->operands[operand]->type->is_vector());
954   }
955
956   /* Storage for our result.  If our result goes into an assignment, it will
957    * just get copy-propagated out, so no worries.
958    */
959   this->result = fs_reg(this, ir->type);
960
961   switch (ir->operation) {
962   case ir_unop_logic_not:
963      emit(fs_inst(BRW_OPCODE_ADD, this->result, op[0], fs_reg(-1)));
964      break;
965   case ir_unop_neg:
966      op[0].negate = !op[0].negate;
967      this->result = op[0];
968      break;
969   case ir_unop_abs:
970      op[0].abs = true;
971      this->result = op[0];
972      break;
973   case ir_unop_sign:
974      temp = fs_reg(this, ir->type);
975
976      emit(fs_inst(BRW_OPCODE_MOV, this->result, fs_reg(0.0f)));
977
978      inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null, op[0], fs_reg(0.0f)));
979      inst->conditional_mod = BRW_CONDITIONAL_G;
980      inst = emit(fs_inst(BRW_OPCODE_MOV, this->result, fs_reg(1.0f)));
981      inst->predicated = true;
982
983      inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null, op[0], fs_reg(0.0f)));
984      inst->conditional_mod = BRW_CONDITIONAL_L;
985      inst = emit(fs_inst(BRW_OPCODE_MOV, this->result, fs_reg(-1.0f)));
986      inst->predicated = true;
987
988      break;
989   case ir_unop_rcp:
990      emit(fs_inst(FS_OPCODE_RCP, this->result, op[0]));
991      break;
992
993   case ir_unop_exp2:
994      emit(fs_inst(FS_OPCODE_EXP2, this->result, op[0]));
995      break;
996   case ir_unop_log2:
997      emit(fs_inst(FS_OPCODE_LOG2, this->result, op[0]));
998      break;
999   case ir_unop_exp:
1000   case ir_unop_log:
1001      assert(!"not reached: should be handled by ir_explog_to_explog2");
1002      break;
1003   case ir_unop_sin:
1004      emit(fs_inst(FS_OPCODE_SIN, this->result, op[0]));
1005      break;
1006   case ir_unop_cos:
1007      emit(fs_inst(FS_OPCODE_COS, this->result, op[0]));
1008      break;
1009
1010   case ir_unop_dFdx:
1011      emit(fs_inst(FS_OPCODE_DDX, this->result, op[0]));
1012      break;
1013   case ir_unop_dFdy:
1014      emit(fs_inst(FS_OPCODE_DDY, this->result, op[0]));
1015      break;
1016
1017   case ir_binop_add:
1018      emit(fs_inst(BRW_OPCODE_ADD, this->result, op[0], op[1]));
1019      break;
1020   case ir_binop_sub:
1021      assert(!"not reached: should be handled by ir_sub_to_add_neg");
1022      break;
1023
1024   case ir_binop_mul:
1025      emit(fs_inst(BRW_OPCODE_MUL, this->result, op[0], op[1]));
1026      break;
1027   case ir_binop_div:
1028      assert(!"not reached: should be handled by ir_div_to_mul_rcp");
1029      break;
1030   case ir_binop_mod:
1031      assert(!"ir_binop_mod should have been converted to b * fract(a/b)");
1032      break;
1033
1034   case ir_binop_less:
1035      inst = emit(fs_inst(BRW_OPCODE_CMP, this->result, op[0], op[1]));
1036      inst->conditional_mod = BRW_CONDITIONAL_L;
1037      emit(fs_inst(BRW_OPCODE_AND, this->result, this->result, fs_reg(0x1)));
1038      break;
1039   case ir_binop_greater:
1040      inst = emit(fs_inst(BRW_OPCODE_CMP, this->result, op[0], op[1]));
1041      inst->conditional_mod = BRW_CONDITIONAL_G;
1042      emit(fs_inst(BRW_OPCODE_AND, this->result, this->result, fs_reg(0x1)));
1043      break;
1044   case ir_binop_lequal:
1045      inst = emit(fs_inst(BRW_OPCODE_CMP, this->result, op[0], op[1]));
1046      inst->conditional_mod = BRW_CONDITIONAL_LE;
1047      emit(fs_inst(BRW_OPCODE_AND, this->result, this->result, fs_reg(0x1)));
1048      break;
1049   case ir_binop_gequal:
1050      inst = emit(fs_inst(BRW_OPCODE_CMP, this->result, op[0], op[1]));
1051      inst->conditional_mod = BRW_CONDITIONAL_GE;
1052      emit(fs_inst(BRW_OPCODE_AND, this->result, this->result, fs_reg(0x1)));
1053      break;
1054   case ir_binop_equal:
1055   case ir_binop_all_equal: /* same as nequal for scalars */
1056      inst = emit(fs_inst(BRW_OPCODE_CMP, this->result, op[0], op[1]));
1057      inst->conditional_mod = BRW_CONDITIONAL_Z;
1058      emit(fs_inst(BRW_OPCODE_AND, this->result, this->result, fs_reg(0x1)));
1059      break;
1060   case ir_binop_nequal:
1061   case ir_binop_any_nequal: /* same as nequal for scalars */
1062      inst = emit(fs_inst(BRW_OPCODE_CMP, this->result, op[0], op[1]));
1063      inst->conditional_mod = BRW_CONDITIONAL_NZ;
1064      emit(fs_inst(BRW_OPCODE_AND, this->result, this->result, fs_reg(0x1)));
1065      break;
1066
1067   case ir_binop_logic_xor:
1068      emit(fs_inst(BRW_OPCODE_XOR, this->result, op[0], op[1]));
1069      break;
1070
1071   case ir_binop_logic_or:
1072      emit(fs_inst(BRW_OPCODE_OR, this->result, op[0], op[1]));
1073      break;
1074
1075   case ir_binop_logic_and:
1076      emit(fs_inst(BRW_OPCODE_AND, this->result, op[0], op[1]));
1077      break;
1078
1079   case ir_binop_dot:
1080   case ir_binop_cross:
1081   case ir_unop_any:
1082      assert(!"not reached: should be handled by brw_fs_channel_expressions");
1083      break;
1084
1085   case ir_unop_noise:
1086      assert(!"not reached: should be handled by lower_noise");
1087      break;
1088
1089   case ir_unop_sqrt:
1090      emit(fs_inst(FS_OPCODE_SQRT, this->result, op[0]));
1091      break;
1092
1093   case ir_unop_rsq:
1094      emit(fs_inst(FS_OPCODE_RSQ, this->result, op[0]));
1095      break;
1096
1097   case ir_unop_i2f:
1098   case ir_unop_b2f:
1099   case ir_unop_b2i:
1100      emit(fs_inst(BRW_OPCODE_MOV, this->result, op[0]));
1101      break;
1102   case ir_unop_f2i:
1103      emit(fs_inst(BRW_OPCODE_MOV, this->result, op[0]));
1104      break;
1105   case ir_unop_f2b:
1106   case ir_unop_i2b:
1107      inst = emit(fs_inst(BRW_OPCODE_CMP, this->result, op[0], fs_reg(0.0f)));
1108      inst->conditional_mod = BRW_CONDITIONAL_NZ;
1109
1110   case ir_unop_trunc:
1111      emit(fs_inst(BRW_OPCODE_RNDD, this->result, op[0]));
1112      break;
1113   case ir_unop_ceil:
1114      op[0].negate = ~op[0].negate;
1115      inst = emit(fs_inst(BRW_OPCODE_RNDD, this->result, op[0]));
1116      this->result.negate = true;
1117      break;
1118   case ir_unop_floor:
1119      inst = emit(fs_inst(BRW_OPCODE_RNDD, this->result, op[0]));
1120      break;
1121   case ir_unop_fract:
1122      inst = emit(fs_inst(BRW_OPCODE_FRC, this->result, op[0]));
1123      break;
1124
1125   case ir_binop_min:
1126      inst = emit(fs_inst(BRW_OPCODE_CMP, this->result, op[0], op[1]));
1127      inst->conditional_mod = BRW_CONDITIONAL_L;
1128
1129      inst = emit(fs_inst(BRW_OPCODE_SEL, this->result, op[0], op[1]));
1130      inst->predicated = true;
1131      break;
1132   case ir_binop_max:
1133      inst = emit(fs_inst(BRW_OPCODE_CMP, this->result, op[0], op[1]));
1134      inst->conditional_mod = BRW_CONDITIONAL_G;
1135
1136      inst = emit(fs_inst(BRW_OPCODE_SEL, this->result, op[0], op[1]));
1137      inst->predicated = true;
1138      break;
1139
1140   case ir_binop_pow:
1141      inst = emit(fs_inst(FS_OPCODE_POW, this->result, op[0], op[1]));
1142      break;
1143
1144   case ir_unop_bit_not:
1145   case ir_unop_u2f:
1146   case ir_binop_lshift:
1147   case ir_binop_rshift:
1148   case ir_binop_bit_and:
1149   case ir_binop_bit_xor:
1150   case ir_binop_bit_or:
1151      assert(!"GLSL 1.30 features unsupported");
1152      break;
1153   }
1154}
1155
1156void
1157fs_visitor::emit_assignment_writes(fs_reg &l, fs_reg &r,
1158				   const glsl_type *type, bool predicated)
1159{
1160   switch (type->base_type) {
1161   case GLSL_TYPE_FLOAT:
1162   case GLSL_TYPE_UINT:
1163   case GLSL_TYPE_INT:
1164   case GLSL_TYPE_BOOL:
1165      for (unsigned int i = 0; i < type->components(); i++) {
1166	 l.type = brw_type_for_base_type(type);
1167	 r.type = brw_type_for_base_type(type);
1168
1169	 fs_inst *inst = emit(fs_inst(BRW_OPCODE_MOV, l, r));
1170	 inst->predicated = predicated;
1171
1172	 l.reg_offset++;
1173	 r.reg_offset++;
1174      }
1175      break;
1176   case GLSL_TYPE_ARRAY:
1177      for (unsigned int i = 0; i < type->length; i++) {
1178	 emit_assignment_writes(l, r, type->fields.array, predicated);
1179      }
1180
1181   case GLSL_TYPE_STRUCT:
1182      for (unsigned int i = 0; i < type->length; i++) {
1183	 emit_assignment_writes(l, r, type->fields.structure[i].type,
1184				predicated);
1185      }
1186      break;
1187
1188   case GLSL_TYPE_SAMPLER:
1189      break;
1190
1191   default:
1192      assert(!"not reached");
1193      break;
1194   }
1195}
1196
/** Emits the moves for an assignment, optionally predicated on a condition.
 *
 * Scalar/vector LHS values honor the write mask directly; other aggregate
 * types are handed to emit_assignment_writes().
 */
void
fs_visitor::visit(ir_assignment *ir)
{
   struct fs_reg l, r;
   fs_inst *inst;

   /* FINISHME: arrays on the lhs */
   ir->lhs->accept(this);
   l = this->result;

   ir->rhs->accept(this);
   r = this->result;

   assert(l.file != BAD_FILE);
   assert(r.file != BAD_FILE);

   if (ir->condition) {
      /* Get the condition bool into the predicate. */
      ir->condition->accept(this);
      inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null, this->result, fs_reg(0)));
      inst->conditional_mod = BRW_CONDITIONAL_NZ;
   }

   if (ir->lhs->type->is_scalar() ||
       ir->lhs->type->is_vector()) {
      /* The RHS components are densely packed: r advances only for channels
       * actually written, while l walks every channel of the LHS.
       */
      for (int i = 0; i < ir->lhs->type->vector_elements; i++) {
	 if (ir->write_mask & (1 << i)) {
	    inst = emit(fs_inst(BRW_OPCODE_MOV, l, r));
	    if (ir->condition)
	       inst->predicated = true;
	    r.reg_offset++;
	 }
	 l.reg_offset++;
      }
   } else {
      emit_assignment_writes(l, r, ir->lhs->type, ir->condition != NULL);
   }
}
1235
/** Sets up the sampler message payload and emits the sample for gen4.
 *
 * Copies the coordinate (plus any shadow comparitor and bias/LOD argument)
 * into MRFs starting at base_mrf, then emits FS_OPCODE_TEX/TXB/TXL.
 * Returns the sample instruction so the caller can fill in sampler state.
 */
fs_inst *
fs_visitor::emit_texture_gen4(ir_texture *ir, fs_reg dst, fs_reg coordinate)
{
   int mlen;
   int base_mrf = 2;
   bool simd16 = false;
   fs_reg orig_dst;

   if (ir->shadow_comparitor) {
      for (mlen = 0; mlen < ir->coordinate->type->vector_elements; mlen++) {
	 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen),
		      coordinate));
	 coordinate.reg_offset++;
      }
      /* gen4's SIMD8 sampler always has the slots for u,v,r present. */
      mlen = 3;

      if (ir->op == ir_tex) {
	 /* There's no plain shadow compare message, so we use shadow
	  * compare with a bias of 0.0.
	  */
	 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen),
		      fs_reg(0.0f)));
	 mlen++;
      } else if (ir->op == ir_txb) {
	 ir->lod_info.bias->accept(this);
	 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen),
		      this->result));
	 mlen++;
      } else {
	 assert(ir->op == ir_txl);
	 ir->lod_info.lod->accept(this);
	 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen),
		      this->result));
	 mlen++;
      }

      /* The comparitor goes in the slot following the bias/LOD. */
      ir->shadow_comparitor->accept(this);
      emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result));
      mlen++;
   } else if (ir->op == ir_tex) {
      for (mlen = 0; mlen < ir->coordinate->type->vector_elements; mlen++) {
	 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen),
		      coordinate));
	 coordinate.reg_offset++;
      }
      /* gen4's SIMD8 sampler always has the slots for u,v,r present. */
      mlen = 3;
   } else {
      /* Oh joy.  gen4 doesn't have SIMD8 non-shadow-compare bias/lod
       * instructions.  We'll need to do SIMD16 here.
       */
      assert(ir->op == ir_txb || ir->op == ir_txl);

      /* Each coordinate channel takes two MRFs in SIMD16 layout; only the
       * lower half of each pair carries live SIMD8 data.
       */
      for (mlen = 0; mlen < ir->coordinate->type->vector_elements * 2;) {
	 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen),
		      coordinate));
	 coordinate.reg_offset++;
	 mlen++;

	 /* The unused upper half. */
	 mlen++;
      }

      /* lod/bias appears after u/v/r. */
      mlen = 6;

      if (ir->op == ir_txb) {
	 ir->lod_info.bias->accept(this);
	 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen),
		      this->result));
	 mlen++;
      } else {
	 ir->lod_info.lod->accept(this);
	 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen),
		      this->result));
	 mlen++;
      }

      /* The unused upper half. */
      mlen++;

      /* Now, since we're doing simd16, the return is 2 interleaved
       * vec4s where the odd-indexed ones are junk. We'll need to move
       * this weirdness around to the expected layout.
       */
      simd16 = true;
      orig_dst = dst;
      dst = fs_reg(this, glsl_type::get_array_instance(glsl_type::vec4_type,
						       2));
      dst.type = BRW_REGISTER_TYPE_F;
   }

   fs_inst *inst = NULL;
   switch (ir->op) {
   case ir_tex:
      inst = emit(fs_inst(FS_OPCODE_TEX, dst, fs_reg(MRF, base_mrf)));
      break;
   case ir_txb:
      inst = emit(fs_inst(FS_OPCODE_TXB, dst, fs_reg(MRF, base_mrf)));
      break;
   case ir_txl:
      inst = emit(fs_inst(FS_OPCODE_TXL, dst, fs_reg(MRF, base_mrf)));
      break;
   case ir_txd:
   case ir_txf:
      assert(!"GLSL 1.30 features unsupported");
      break;
   }
   inst->mlen = mlen;

   if (simd16) {
      /* Unpack the interleaved SIMD16 return: keep the even-indexed
       * components (live data) and skip the junk odd-indexed ones.
       */
      for (int i = 0; i < 4; i++) {
	 emit(fs_inst(BRW_OPCODE_MOV, orig_dst, dst));
	 orig_dst.reg_offset++;
	 dst.reg_offset += 2;
      }
   }

   return inst;
}
1357
/** Sets up the sampler message payload and emits the sample for gen5+.
 *
 * Returns the sample instruction so the caller can fill in sampler state.
 */
fs_inst *
fs_visitor::emit_texture_gen5(ir_texture *ir, fs_reg dst, fs_reg coordinate)
{
   /* gen5's SIMD8 sampler has slots for u, v, r, array index, then
    * optional parameters like shadow comparitor or LOD bias.  If
    * optional parameters aren't present, those base slots are
    * optional and don't need to be included in the message.
    *
    * We don't fill in the unnecessary slots regardless, which may
    * look surprising in the disassembly.
    */
   int mlen;
   int base_mrf = 2;

   for (mlen = 0; mlen < ir->coordinate->type->vector_elements; mlen++) {
      emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), coordinate));
      coordinate.reg_offset++;
   }

   if (ir->shadow_comparitor) {
      /* The comparitor goes in the slot after the four base slots. */
      mlen = MAX2(mlen, 4);

      ir->shadow_comparitor->accept(this);
      emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result));
      mlen++;
   }

   fs_inst *inst = NULL;
   switch (ir->op) {
   case ir_tex:
      inst = emit(fs_inst(FS_OPCODE_TEX, dst, fs_reg(MRF, base_mrf)));
      break;
   case ir_txb:
      /* Bias follows the base slots (and any comparitor already placed). */
      ir->lod_info.bias->accept(this);
      mlen = MAX2(mlen, 4);
      emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result));
      mlen++;

      inst = emit(fs_inst(FS_OPCODE_TXB, dst, fs_reg(MRF, base_mrf)));
      break;
   case ir_txl:
      ir->lod_info.lod->accept(this);
      mlen = MAX2(mlen, 4);
      emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result));
      mlen++;

      inst = emit(fs_inst(FS_OPCODE_TXL, dst, fs_reg(MRF, base_mrf)));
      break;
   case ir_txd:
   case ir_txf:
      assert(!"GLSL 1.30 features unsupported");
      break;
   }
   inst->mlen = mlen;

   return inst;
}
1415
/** Emits code for a texture sample, dispatching to the per-generation
 * payload setup and applying any texture swizzle from the program key.
 */
void
fs_visitor::visit(ir_texture *ir)
{
   fs_inst *inst = NULL;

   ir->coordinate->accept(this);
   fs_reg coordinate = this->result;

   /* Should be lowered by do_lower_texture_projection */
   assert(!ir->projector);

   /* Writemasking doesn't eliminate channels on SIMD8 texture
    * samples, so don't worry about them.
    */
   fs_reg dst = fs_reg(this, glsl_type::vec4_type);

   if (intel->gen < 5) {
      inst = emit_texture_gen4(ir, dst, coordinate);
   } else {
      inst = emit_texture_gen5(ir, dst, coordinate);
   }

   /* Resolve the sampler uniform to the bound texture unit. */
   inst->sampler =
      _mesa_get_sampler_uniform_value(ir->sampler,
				      ctx->Shader.CurrentProgram,
				      &brw->fragment_program->Base);
   inst->sampler = c->fp->program.Base.SamplerUnits[inst->sampler];

   this->result = dst;

   if (ir->shadow_comparitor)
      inst->shadow_compare = true;

   /* Apply the program key's per-sampler swizzle by shuffling the result
    * channels into a fresh vec4.
    */
   if (c->key.tex_swizzles[inst->sampler] != SWIZZLE_NOOP) {
      fs_reg swizzle_dst = fs_reg(this, glsl_type::vec4_type);

      for (int i = 0; i < 4; i++) {
	 int swiz = GET_SWZ(c->key.tex_swizzles[inst->sampler], i);
	 fs_reg l = swizzle_dst;
	 l.reg_offset += i;

	 if (swiz == SWIZZLE_ZERO) {
	    emit(fs_inst(BRW_OPCODE_MOV, l, fs_reg(0.0f)));
	 } else if (swiz == SWIZZLE_ONE) {
	    emit(fs_inst(BRW_OPCODE_MOV, l, fs_reg(1.0f)));
	 } else {
	    fs_reg r = dst;
	    /* Same value as the `swiz` computed above. */
	    r.reg_offset += GET_SWZ(c->key.tex_swizzles[inst->sampler], i);
	    emit(fs_inst(BRW_OPCODE_MOV, l, r));
	 }
      }
      this->result = swizzle_dst;
   }
}
1470
1471void
1472fs_visitor::visit(ir_swizzle *ir)
1473{
1474   ir->val->accept(this);
1475   fs_reg val = this->result;
1476
1477   if (ir->type->vector_elements == 1) {
1478      this->result.reg_offset += ir->mask.x;
1479      return;
1480   }
1481
1482   fs_reg result = fs_reg(this, ir->type);
1483   this->result = result;
1484
1485   for (unsigned int i = 0; i < ir->type->vector_elements; i++) {
1486      fs_reg channel = val;
1487      int swiz = 0;
1488
1489      switch (i) {
1490      case 0:
1491	 swiz = ir->mask.x;
1492	 break;
1493      case 1:
1494	 swiz = ir->mask.y;
1495	 break;
1496      case 2:
1497	 swiz = ir->mask.z;
1498	 break;
1499      case 3:
1500	 swiz = ir->mask.w;
1501	 break;
1502      }
1503
1504      channel.reg_offset += swiz;
1505      emit(fs_inst(BRW_OPCODE_MOV, result, channel));
1506      result.reg_offset++;
1507   }
1508}
1509
/** Emits a fragment discard; only the unconditional form is supported. */
void
fs_visitor::visit(ir_discard *ir)
{
   /* temp is scratch used by generate_discard() to build the pixel mask. */
   fs_reg temp = fs_reg(this, glsl_type::uint_type);

   assert(ir->condition == NULL); /* FINISHME */

   emit(fs_inst(FS_OPCODE_DISCARD, temp, temp));
   kill_emitted = true;
}
1520
1521void
1522fs_visitor::visit(ir_constant *ir)
1523{
1524   fs_reg reg(this, ir->type);
1525   this->result = reg;
1526
1527   for (unsigned int i = 0; i < ir->type->vector_elements; i++) {
1528      switch (ir->type->base_type) {
1529      case GLSL_TYPE_FLOAT:
1530	 emit(fs_inst(BRW_OPCODE_MOV, reg, fs_reg(ir->value.f[i])));
1531	 break;
1532      case GLSL_TYPE_UINT:
1533	 emit(fs_inst(BRW_OPCODE_MOV, reg, fs_reg(ir->value.u[i])));
1534	 break;
1535      case GLSL_TYPE_INT:
1536	 emit(fs_inst(BRW_OPCODE_MOV, reg, fs_reg(ir->value.i[i])));
1537	 break;
1538      case GLSL_TYPE_BOOL:
1539	 emit(fs_inst(BRW_OPCODE_MOV, reg, fs_reg((int)ir->value.b[i])));
1540	 break;
1541      default:
1542	 assert(!"Non-float/uint/int/bool constant");
1543      }
1544      reg.reg_offset++;
1545   }
1546}
1547
/** Emits structured IF/ELSE/ENDIF flow control for an if statement.
 *
 * The condition value is moved to the null register with a nonzero
 * conditional mod to set the flag, and the IF is predicated on that flag.
 */
void
fs_visitor::visit(ir_if *ir)
{
   fs_inst *inst;

   /* Don't point the annotation at the if statement, because then it plus
    * the then and else blocks get printed.
    */
   this->base_ir = ir->condition;

   /* Generate the condition into the condition code. */
   ir->condition->accept(this);
   inst = emit(fs_inst(BRW_OPCODE_MOV, fs_reg(brw_null_reg()), this->result));
   inst->conditional_mod = BRW_CONDITIONAL_NZ;

   inst = emit(fs_inst(BRW_OPCODE_IF));
   inst->predicated = true;

   foreach_iter(exec_list_iterator, iter, ir->then_instructions) {
      ir_instruction *ir = (ir_instruction *)iter.get();
      this->base_ir = ir;

      ir->accept(this);
   }

   if (!ir->else_instructions.is_empty()) {
      emit(fs_inst(BRW_OPCODE_ELSE));

      foreach_iter(exec_list_iterator, iter, ir->else_instructions) {
	 ir_instruction *ir = (ir_instruction *)iter.get();
	 this->base_ir = ir;

	 ir->accept(this);
      }
   }

   emit(fs_inst(BRW_OPCODE_ENDIF));
}
1586
/** Emits a DO...WHILE loop for an ir_loop.
 *
 * When loop analysis filled in counter/from/to/increment, this also emits
 * the counter initialization before the loop, a predicated BREAK on the
 * bound comparison at the top, and the increment at the bottom.
 */
void
fs_visitor::visit(ir_loop *ir)
{
   fs_reg counter = reg_undef;

   if (ir->counter) {
      this->base_ir = ir->counter;
      ir->counter->accept(this);
      counter = *(variable_storage(ir->counter));

      if (ir->from) {
	 this->base_ir = ir->from;
	 ir->from->accept(this);

	 /* Initialize the counter to its starting value. */
	 emit(fs_inst(BRW_OPCODE_MOV, counter, this->result));
      }
   }

   emit(fs_inst(BRW_OPCODE_DO));

   if (ir->to) {
      this->base_ir = ir->to;
      ir->to->accept(this);

      /* Compare the counter against the bound using ir->cmp; the BREAK
       * below is predicated on the resulting flag.
       */
      fs_inst *inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null,
				   counter, this->result));
      switch (ir->cmp) {
      case ir_binop_equal:
	 inst->conditional_mod = BRW_CONDITIONAL_Z;
	 break;
      case ir_binop_nequal:
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 break;
      case ir_binop_gequal:
	 inst->conditional_mod = BRW_CONDITIONAL_GE;
	 break;
      case ir_binop_lequal:
	 inst->conditional_mod = BRW_CONDITIONAL_LE;
	 break;
      case ir_binop_greater:
	 inst->conditional_mod = BRW_CONDITIONAL_G;
	 break;
      case ir_binop_less:
	 inst->conditional_mod = BRW_CONDITIONAL_L;
	 break;
      default:
	 assert(!"not reached: unknown loop condition");
	 this->fail = true;
	 break;
      }

      inst = emit(fs_inst(BRW_OPCODE_BREAK));
      inst->predicated = true;
   }

   foreach_iter(exec_list_iterator, iter, ir->body_instructions) {
      ir_instruction *ir = (ir_instruction *)iter.get();

      this->base_ir = ir;
      ir->accept(this);
   }

   if (ir->increment) {
      this->base_ir = ir->increment;
      ir->increment->accept(this);
      emit(fs_inst(BRW_OPCODE_ADD, counter, counter, this->result));
   }

   emit(fs_inst(BRW_OPCODE_WHILE));
}
1657
1658void
1659fs_visitor::visit(ir_loop_jump *ir)
1660{
1661   switch (ir->mode) {
1662   case ir_loop_jump::jump_break:
1663      emit(fs_inst(BRW_OPCODE_BREAK));
1664      break;
1665   case ir_loop_jump::jump_continue:
1666      emit(fs_inst(BRW_OPCODE_CONTINUE));
1667      break;
1668   }
1669}
1670
/* FINISHME: function calls should have been inlined before codegen (see
 * visit(ir_function)); direct call support is unimplemented.
 */
void
fs_visitor::visit(ir_call *ir)
{
   assert(!"FINISHME");
}
1676
/* FINISHME: returning values from functions is unimplemented. */
void
fs_visitor::visit(ir_return *ir)
{
   assert(!"FINISHME");
}
1682
1683void
1684fs_visitor::visit(ir_function *ir)
1685{
1686   /* Ignore function bodies other than main() -- we shouldn't see calls to
1687    * them since they should all be inlined before we get to ir_to_mesa.
1688    */
1689   if (strcmp(ir->name, "main") == 0) {
1690      const ir_function_signature *sig;
1691      exec_list empty;
1692
1693      sig = ir->matching_signature(&empty);
1694
1695      assert(sig);
1696
1697      foreach_iter(exec_list_iterator, iter, sig->body) {
1698	 ir_instruction *ir = (ir_instruction *)iter.get();
1699	 this->base_ir = ir;
1700
1701	 ir->accept(this);
1702      }
1703   }
1704}
1705
/* Signatures are walked directly by visit(ir_function); reaching one here
 * is a visitor bug.
 */
void
fs_visitor::visit(ir_function_signature *ir)
{
   assert(!"not reached");
   (void)ir;
}
1712
1713fs_inst *
1714fs_visitor::emit(fs_inst inst)
1715{
1716   fs_inst *list_inst = new(mem_ctx) fs_inst;
1717   *list_inst = inst;
1718
1719   list_inst->annotation = this->current_annotation;
1720   list_inst->ir = this->base_ir;
1721
1722   this->instructions.push_tail(list_inst);
1723
1724   return list_inst;
1725}
1726
1727/** Emits a dummy fragment shader consisting of magenta for bringup purposes. */
1728void
1729fs_visitor::emit_dummy_fs()
1730{
1731   /* Everyone's favorite color. */
1732   emit(fs_inst(BRW_OPCODE_MOV,
1733		fs_reg(MRF, 2),
1734		fs_reg(1.0f)));
1735   emit(fs_inst(BRW_OPCODE_MOV,
1736		fs_reg(MRF, 3),
1737		fs_reg(0.0f)));
1738   emit(fs_inst(BRW_OPCODE_MOV,
1739		fs_reg(MRF, 4),
1740		fs_reg(1.0f)));
1741   emit(fs_inst(BRW_OPCODE_MOV,
1742		fs_reg(MRF, 5),
1743		fs_reg(0.0f)));
1744
1745   fs_inst *write;
1746   write = emit(fs_inst(FS_OPCODE_FB_WRITE,
1747			fs_reg(0),
1748			fs_reg(0)));
1749}
1750
1751/* The register location here is relative to the start of the URB
1752 * data.  It will get adjusted to be a real location before
1753 * generate_code() time.
1754 */
1755struct brw_reg
1756fs_visitor::interp_reg(int location, int channel)
1757{
1758   int regnr = urb_setup[location] * 2 + channel / 2;
1759   int stride = (channel & 1) * 4;
1760
1761   assert(urb_setup[location] != -1);
1762
1763   return brw_vec1_grf(regnr, stride);
1764}
1765
/** Emits the interpolation for the varying inputs. */
void
fs_visitor::emit_interpolation_setup_gen4()
{
   struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);

   this->current_annotation = "compute pixel centers";
   this->pixel_x = fs_reg(this, glsl_type::uint_type);
   this->pixel_y = fs_reg(this, glsl_type::uint_type);
   this->pixel_x.type = BRW_REGISTER_TYPE_UW;
   this->pixel_y.type = BRW_REGISTER_TYPE_UW;
   /* Pixel centers = subspan origins from g1 plus the per-pixel offset
    * immediates.  NOTE(review): assumes the standard gen4 payload layout
    * for the g1 subspan coordinates -- confirm against the PRM.
    */
   emit(fs_inst(BRW_OPCODE_ADD,
		this->pixel_x,
		fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
		fs_reg(brw_imm_v(0x10101010))));
   emit(fs_inst(BRW_OPCODE_ADD,
		this->pixel_y,
		fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
		fs_reg(brw_imm_v(0x11001100))));

   this->current_annotation = "compute pixel deltas from v0";
   if (brw->has_pln) {
      /* PLN reads the deltas from an adjacent register pair, so allocate
       * delta_x/delta_y together as one vec2.
       */
      this->delta_x = fs_reg(this, glsl_type::vec2_type);
      this->delta_y = this->delta_x;
      this->delta_y.reg_offset++;
   } else {
      this->delta_x = fs_reg(this, glsl_type::float_type);
      this->delta_y = fs_reg(this, glsl_type::float_type);
   }
   emit(fs_inst(BRW_OPCODE_ADD,
		this->delta_x,
		this->pixel_x,
		fs_reg(negate(brw_vec1_grf(1, 0)))));
   emit(fs_inst(BRW_OPCODE_ADD,
		this->delta_y,
		this->pixel_y,
		fs_reg(negate(brw_vec1_grf(1, 1)))));

   this->current_annotation = "compute pos.w and 1/pos.w";
   /* Compute wpos.w.  It's always in our setup, since it's needed to
    * interpolate the other attributes.
    */
   this->wpos_w = fs_reg(this, glsl_type::float_type);
   emit(fs_inst(FS_OPCODE_LINTERP, wpos_w, this->delta_x, this->delta_y,
		interp_reg(FRAG_ATTRIB_WPOS, 3)));
   /* Compute the pixel 1/W value from wpos.w. */
   this->pixel_w = fs_reg(this, glsl_type::float_type);
   emit(fs_inst(FS_OPCODE_RCP, this->pixel_w, wpos_w));
   this->current_annotation = NULL;
}
1816
/** Emits the interpolation for the varying inputs. */
void
fs_visitor::emit_interpolation_setup_gen6()
{
   struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);

   /* If the pixel centers end up used, the setup is the same as for gen4. */
   this->current_annotation = "compute pixel centers";
   this->pixel_x = fs_reg(this, glsl_type::uint_type);
   this->pixel_y = fs_reg(this, glsl_type::uint_type);
   this->pixel_x.type = BRW_REGISTER_TYPE_UW;
   this->pixel_y.type = BRW_REGISTER_TYPE_UW;
   emit(fs_inst(BRW_OPCODE_ADD,
		this->pixel_x,
		fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
		fs_reg(brw_imm_v(0x10101010))));
   emit(fs_inst(BRW_OPCODE_ADD,
		this->pixel_y,
		fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
		fs_reg(brw_imm_v(0x11001100))));

   this->current_annotation = "compute 1/pos.w";
   /* On gen6, W and the deltas arrive pre-computed in fixed payload
    * registers described by the compile key, so no LINTERP is needed.
    */
   this->wpos_w = fs_reg(brw_vec8_grf(c->key.source_w_reg, 0));
   this->pixel_w = fs_reg(this, glsl_type::float_type);
   emit(fs_inst(FS_OPCODE_RCP, this->pixel_w, wpos_w));

   this->delta_x = fs_reg(brw_vec8_grf(2, 0));
   this->delta_y = fs_reg(brw_vec8_grf(3, 0));

   this->current_annotation = NULL;
}
1848
/** Emits the framebuffer write messages (one per color region) that end
 * the fragment program, assembling the MRF payload of optional header,
 * AA/stencil, color, and depth fields.
 */
void
fs_visitor::emit_fb_writes()
{
   this->current_annotation = "FB write header";
   GLboolean header_present = GL_TRUE;
   int nr = 0;

   /* gen6 can skip the two-register header when there's one render target
    * and no KIL that needs to flow through the pixel mask.
    */
   if (intel->gen >= 6 &&
       !this->kill_emitted &&
       c->key.nr_color_regions == 1) {
      header_present = false;
   }

   if (header_present) {
      /* m0, m1 header */
      nr += 2;
   }

   if (c->key.aa_dest_stencil_reg) {
      emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
		   fs_reg(brw_vec8_grf(c->key.aa_dest_stencil_reg, 0))));
   }

   /* Reserve space for color. It'll be filled in per MRT below. */
   int color_mrf = nr;
   nr += 4;

   if (c->key.source_depth_to_render_target) {
      if (c->key.computes_depth) {
	 /* Hand over gl_FragDepth. */
	 assert(this->frag_depth);
	 fs_reg depth = *(variable_storage(this->frag_depth));

	 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, nr++), depth));
      } else {
	 /* Pass through the payload depth. */
	 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
		      fs_reg(brw_vec8_grf(c->key.source_depth_reg, 0))));
      }
   }

   if (c->key.dest_depth_reg) {
      emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
		   fs_reg(brw_vec8_grf(c->key.dest_depth_reg, 0))));
   }

   fs_reg color = reg_undef;
   if (this->frag_color)
      color = *(variable_storage(this->frag_color));
   else if (this->frag_data)
      color = *(variable_storage(this->frag_data));

   for (int target = 0; target < c->key.nr_color_regions; target++) {
      this->current_annotation = talloc_asprintf(this->mem_ctx,
						 "FB write target %d",
						 target);
      if (this->frag_color || this->frag_data) {
	 for (int i = 0; i < 4; i++) {
	    emit(fs_inst(BRW_OPCODE_MOV,
			 fs_reg(MRF, color_mrf + i),
			 color));
	    color.reg_offset++;
	 }
      }

      /* gl_FragColor broadcasts the same vec4 to every target, so rewind;
       * gl_FragData[] advances to the next array element.
       */
      if (this->frag_color)
	 color.reg_offset -= 4;

      /* NOTE(review): header_present is only copied onto the instruction
       * in the zero-region path below, not here -- confirm fs_inst's
       * default matches what generate_fb_write() expects.
       */
      fs_inst *inst = emit(fs_inst(FS_OPCODE_FB_WRITE,
				   reg_undef, reg_undef));
      inst->target = target;
      inst->mlen = nr;
      if (target == c->key.nr_color_regions - 1)
	 inst->eot = true;
   }

   if (c->key.nr_color_regions == 0) {
      /* Even with no color buffers bound, one EOT write must be sent. */
      fs_inst *inst = emit(fs_inst(FS_OPCODE_FB_WRITE,
				   reg_undef, reg_undef));
      inst->mlen = nr;
      inst->eot = true;
      inst->header_present = header_present;
   }

   this->current_annotation = NULL;
}
1935
/** Generates the hardware SEND for an FB write emitted by emit_fb_writes(). */
void
fs_visitor::generate_fb_write(fs_inst *inst)
{
   GLboolean eot = inst->eot;
   struct brw_reg implied_header;

   /* Header is 2 regs, g0 and g1 are the contents. g0 will be implied
    * move, here's g1.
    */
   brw_push_insn_state(p);
   brw_set_mask_control(p, BRW_MASK_DISABLE);
   brw_set_compression_control(p, BRW_COMPRESSION_NONE);

   if (inst->header_present) {
      if (intel->gen >= 6) {
	 /* gen6 takes no implied g0 move; copy g0 to m0 explicitly. */
	 brw_MOV(p,
		 brw_message_reg(0),
		 brw_vec8_grf(0, 0));
	 implied_header = brw_null_reg();
      } else {
	 implied_header = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
      }

      brw_MOV(p,
	      brw_message_reg(1),
	      brw_vec8_grf(1, 0));
   } else {
      implied_header = brw_null_reg();
   }

   brw_pop_insn_state(p);

   brw_fb_WRITE(p,
		8, /* dispatch_width */
		retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW),
		0, /* base MRF */
		implied_header,
		inst->target,
		inst->mlen,
		0,
		eot);
}
1978
1979void
1980fs_visitor::generate_linterp(fs_inst *inst,
1981			     struct brw_reg dst, struct brw_reg *src)
1982{
1983   struct brw_reg delta_x = src[0];
1984   struct brw_reg delta_y = src[1];
1985   struct brw_reg interp = src[2];
1986
1987   if (brw->has_pln &&
1988       delta_y.nr == delta_x.nr + 1 &&
1989       (intel->gen >= 6 || (delta_x.nr & 1) == 0)) {
1990      brw_PLN(p, dst, interp, delta_x);
1991   } else {
1992      brw_LINE(p, brw_null_reg(), interp, delta_x);
1993      brw_MAC(p, dst, suboffset(interp, 1), delta_y);
1994   }
1995}
1996
1997void
1998fs_visitor::generate_math(fs_inst *inst,
1999			  struct brw_reg dst, struct brw_reg *src)
2000{
2001   int op;
2002
2003   switch (inst->opcode) {
2004   case FS_OPCODE_RCP:
2005      op = BRW_MATH_FUNCTION_INV;
2006      break;
2007   case FS_OPCODE_RSQ:
2008      op = BRW_MATH_FUNCTION_RSQ;
2009      break;
2010   case FS_OPCODE_SQRT:
2011      op = BRW_MATH_FUNCTION_SQRT;
2012      break;
2013   case FS_OPCODE_EXP2:
2014      op = BRW_MATH_FUNCTION_EXP;
2015      break;
2016   case FS_OPCODE_LOG2:
2017      op = BRW_MATH_FUNCTION_LOG;
2018      break;
2019   case FS_OPCODE_POW:
2020      op = BRW_MATH_FUNCTION_POW;
2021      break;
2022   case FS_OPCODE_SIN:
2023      op = BRW_MATH_FUNCTION_SIN;
2024      break;
2025   case FS_OPCODE_COS:
2026      op = BRW_MATH_FUNCTION_COS;
2027      break;
2028   default:
2029      assert(!"not reached: unknown math function");
2030      op = 0;
2031      break;
2032   }
2033
2034   if (inst->opcode == FS_OPCODE_POW) {
2035      brw_MOV(p, brw_message_reg(3), src[1]);
2036   }
2037
2038   brw_math(p, dst,
2039	    op,
2040	    inst->saturate ? BRW_MATH_SATURATE_SATURATE :
2041	    BRW_MATH_SATURATE_NONE,
2042	    2, src[0],
2043	    BRW_MATH_DATA_VECTOR,
2044	    BRW_MATH_PRECISION_FULL);
2045}
2046
/** Generates the sampler SEND for a texture instruction.
 *
 * Chooses the message type and SIMD mode per hardware generation, then
 * emits brw_SAMPLE() with the g0 header prepended to the payload.
 */
void
fs_visitor::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
{
   int msg_type = -1;
   int rlen = 4;
   uint32_t simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;

   if (intel->gen == 5) {
      switch (inst->opcode) {
      case FS_OPCODE_TEX:
	 if (inst->shadow_compare) {
	    msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_COMPARE_GEN5;
	 } else {
	    msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_GEN5;
	 }
	 break;
      case FS_OPCODE_TXB:
	 if (inst->shadow_compare) {
	    msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE_GEN5;
	 } else {
	    msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_BIAS_GEN5;
	 }
	 break;
      }
   } else {
      switch (inst->opcode) {
      case FS_OPCODE_TEX:
	 /* Note that G45 and older determines shadow compare and dispatch width
	  * from message length for most messages.
	  */
	 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
	 if (inst->shadow_compare) {
	    assert(inst->mlen == 5);
	 } else {
	    assert(inst->mlen <= 6);
	 }
	 break;
      case FS_OPCODE_TXB:
	 if (inst->shadow_compare) {
	    assert(inst->mlen == 5);
	    msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
	 } else {
	    /* Non-shadow bias requires the SIMD16 message on gen4; see
	     * emit_texture_gen4() for the matching payload layout.
	     */
	    assert(inst->mlen == 8);
	    msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
	    simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
	 }
	 break;
      }
   }
   assert(msg_type != -1);

   if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) {
      /* SIMD16 returns twice as much data. */
      rlen = 8;
      dst = vec16(dst);
   }

   /* g0 header. */
   src.nr--;

   brw_SAMPLE(p,
	      retype(dst, BRW_REGISTER_TYPE_UW),
	      src.nr,
	      retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
              SURF_INDEX_TEXTURE(inst->sampler),
	      inst->sampler,
	      WRITEMASK_XYZW,
	      msg_type,
	      rlen,
	      inst->mlen + 1,
	      0,
	      1,
	      simd_mode);
}
2120
2121
2122/* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
2123 * looking like:
2124 *
2125 * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
2126 *
2127 * and we're trying to produce:
2128 *
2129 *           DDX                     DDY
2130 * dst: (ss0.tr - ss0.tl)     (ss0.tl - ss0.bl)
2131 *      (ss0.tr - ss0.tl)     (ss0.tr - ss0.br)
2132 *      (ss0.br - ss0.bl)     (ss0.tl - ss0.bl)
2133 *      (ss0.br - ss0.bl)     (ss0.tr - ss0.br)
2134 *      (ss1.tr - ss1.tl)     (ss1.tl - ss1.bl)
2135 *      (ss1.tr - ss1.tl)     (ss1.tr - ss1.br)
2136 *      (ss1.br - ss1.bl)     (ss1.tl - ss1.bl)
2137 *      (ss1.br - ss1.bl)     (ss1.tr - ss1.br)
2138 *
2139 * and add another set of two more subspans if in 16-pixel dispatch mode.
2140 *
2141 * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
2142 * for each pair, and vertstride = 2 jumps us 2 elements after processing a
2143 * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
2144 * between each other.  We could probably do it like ddx and swizzle the right
2145 * order later, but bail for now and just produce
2146 * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
2147 */
2148void
2149fs_visitor::generate_ddx(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
2150{
2151   struct brw_reg src0 = brw_reg(src.file, src.nr, 1,
2152				 BRW_REGISTER_TYPE_F,
2153				 BRW_VERTICAL_STRIDE_2,
2154				 BRW_WIDTH_2,
2155				 BRW_HORIZONTAL_STRIDE_0,
2156				 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
2157   struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
2158				 BRW_REGISTER_TYPE_F,
2159				 BRW_VERTICAL_STRIDE_2,
2160				 BRW_WIDTH_2,
2161				 BRW_HORIZONTAL_STRIDE_0,
2162				 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
2163   brw_ADD(p, dst, src0, negate(src1));
2164}
2165
2166void
2167fs_visitor::generate_ddy(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
2168{
2169   struct brw_reg src0 = brw_reg(src.file, src.nr, 0,
2170				 BRW_REGISTER_TYPE_F,
2171				 BRW_VERTICAL_STRIDE_4,
2172				 BRW_WIDTH_4,
2173				 BRW_HORIZONTAL_STRIDE_0,
2174				 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
2175   struct brw_reg src1 = brw_reg(src.file, src.nr, 2,
2176				 BRW_REGISTER_TYPE_F,
2177				 BRW_VERTICAL_STRIDE_4,
2178				 BRW_WIDTH_4,
2179				 BRW_HORIZONTAL_STRIDE_0,
2180				 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
2181   brw_ADD(p, dst, src0, negate(src1));
2182}
2183
/* Clears the pixel-enable bits in the g0 header for the channels being
 * discarded, by ANDing g0 with the negated execution mask.
 * NOTE(review): presumably g0 is later reused as the message header for
 * the framebuffer write, so the discarded pixels never land -- confirm
 * against generate_fb_write().
 */
void
fs_visitor::generate_discard(fs_inst *inst, struct brw_reg temp)
{
   struct brw_reg g0 = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
   temp = brw_uw1_reg(temp.file, temp.nr, 0);

   brw_push_insn_state(p);
   /* Disable masking so the mask-register manipulation itself runs on all
    * channels regardless of current execution state.
    */
   brw_set_mask_control(p, BRW_MASK_DISABLE);
   brw_NOT(p, temp, brw_mask_reg(1)); /* IMASK */
   brw_AND(p, g0, temp, g0);
   brw_pop_insn_state(p);
}
2196
2197void
2198fs_visitor::assign_curb_setup()
2199{
2200   c->prog_data.first_curbe_grf = c->key.nr_payload_regs;
2201   c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
2202
2203   /* Map the offsets in the UNIFORM file to fixed HW regs. */
2204   foreach_iter(exec_list_iterator, iter, this->instructions) {
2205      fs_inst *inst = (fs_inst *)iter.get();
2206
2207      for (unsigned int i = 0; i < 3; i++) {
2208	 if (inst->src[i].file == UNIFORM) {
2209	    int constant_nr = inst->src[i].hw_reg + inst->src[i].reg_offset;
2210	    struct brw_reg brw_reg = brw_vec1_grf(c->prog_data.first_curbe_grf +
2211						  constant_nr / 8,
2212						  constant_nr % 8);
2213
2214	    inst->src[i].file = FIXED_HW_REG;
2215	    inst->src[i].fixed_hw_reg = brw_reg;
2216	 }
2217      }
2218   }
2219}
2220
2221void
2222fs_visitor::calculate_urb_setup()
2223{
2224   for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
2225      urb_setup[i] = -1;
2226   }
2227
2228   int urb_next = 0;
2229   /* Figure out where each of the incoming setup attributes lands. */
2230   if (intel->gen >= 6) {
2231      for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
2232	 if (i == FRAG_ATTRIB_WPOS ||
2233	     (brw->fragment_program->Base.InputsRead & BITFIELD64_BIT(i))) {
2234	    urb_setup[i] = urb_next++;
2235	 }
2236      }
2237   } else {
2238      /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
2239      for (unsigned int i = 0; i < VERT_RESULT_MAX; i++) {
2240	 if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
2241	    int fp_index;
2242
2243	    if (i >= VERT_RESULT_VAR0)
2244	       fp_index = i - (VERT_RESULT_VAR0 - FRAG_ATTRIB_VAR0);
2245	    else if (i <= VERT_RESULT_TEX7)
2246	       fp_index = i;
2247	    else
2248	       fp_index = -1;
2249
2250	    if (fp_index >= 0)
2251	       urb_setup[fp_index] = urb_next++;
2252	 }
2253      }
2254   }
2255
2256   /* Each attribute is 4 setup channels, each of which is half a reg. */
2257   c->prog_data.urb_read_length = urb_next * 2;
2258}
2259
2260void
2261fs_visitor::assign_urb_setup()
2262{
2263   int urb_start = c->prog_data.first_curbe_grf + c->prog_data.curb_read_length;
2264
2265   /* Offset all the urb_setup[] index by the actual position of the
2266    * setup regs, now that the location of the constants has been chosen.
2267    */
2268   foreach_iter(exec_list_iterator, iter, this->instructions) {
2269      fs_inst *inst = (fs_inst *)iter.get();
2270
2271      if (inst->opcode != FS_OPCODE_LINTERP)
2272	 continue;
2273
2274      assert(inst->src[2].file == FIXED_HW_REG);
2275
2276      inst->src[2].fixed_hw_reg.nr += urb_start;
2277   }
2278
2279   this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
2280}
2281
2282static void
2283assign_reg(int *reg_hw_locations, fs_reg *reg)
2284{
2285   if (reg->file == GRF && reg->reg != 0) {
2286      reg->hw_reg = reg_hw_locations[reg->reg] + reg->reg_offset;
2287      reg->reg = 0;
2288   }
2289}
2290
2291void
2292fs_visitor::assign_regs_trivial()
2293{
2294   int last_grf = 0;
2295   int hw_reg_mapping[this->virtual_grf_next];
2296   int i;
2297
2298   hw_reg_mapping[0] = 0;
2299   hw_reg_mapping[1] = this->first_non_payload_grf;
2300   for (i = 2; i < this->virtual_grf_next; i++) {
2301      hw_reg_mapping[i] = (hw_reg_mapping[i - 1] +
2302			   this->virtual_grf_sizes[i - 1]);
2303   }
2304   last_grf = hw_reg_mapping[i - 1] + this->virtual_grf_sizes[i - 1];
2305
2306   foreach_iter(exec_list_iterator, iter, this->instructions) {
2307      fs_inst *inst = (fs_inst *)iter.get();
2308
2309      assign_reg(hw_reg_mapping, &inst->dst);
2310      assign_reg(hw_reg_mapping, &inst->src[0]);
2311      assign_reg(hw_reg_mapping, &inst->src[1]);
2312   }
2313
2314   this->grf_used = last_grf + 1;
2315}
2316
2317void
2318fs_visitor::assign_regs()
2319{
2320   int last_grf = 0;
2321   int hw_reg_mapping[this->virtual_grf_next + 1];
2322   int base_reg_count = BRW_MAX_GRF - this->first_non_payload_grf;
2323   int class_sizes[base_reg_count];
2324   int class_count = 0;
2325   int aligned_pair_class = -1;
2326
2327   /* Set up the register classes.
2328    *
2329    * The base registers store a scalar value.  For texture samples,
2330    * we get virtual GRFs composed of 4 contiguous hw register.  For
2331    * structures and arrays, we store them as contiguous larger things
2332    * than that, though we should be able to do better most of the
2333    * time.
2334    */
2335   class_sizes[class_count++] = 1;
2336   if (brw->has_pln && intel->gen < 6) {
2337      /* Always set up the (unaligned) pairs for gen5, so we can find
2338       * them for making the aligned pair class.
2339       */
2340      class_sizes[class_count++] = 2;
2341   }
2342   for (int r = 1; r < this->virtual_grf_next; r++) {
2343      int i;
2344
2345      for (i = 0; i < class_count; i++) {
2346	 if (class_sizes[i] == this->virtual_grf_sizes[r])
2347	    break;
2348      }
2349      if (i == class_count) {
2350	 if (this->virtual_grf_sizes[r] >= base_reg_count) {
2351	    fprintf(stderr, "Object too large to register allocate.\n");
2352	    this->fail = true;
2353	 }
2354
2355	 class_sizes[class_count++] = this->virtual_grf_sizes[r];
2356      }
2357   }
2358
2359   int ra_reg_count = 0;
2360   int class_base_reg[class_count];
2361   int class_reg_count[class_count];
2362   int classes[class_count + 1];
2363
2364   for (int i = 0; i < class_count; i++) {
2365      class_base_reg[i] = ra_reg_count;
2366      class_reg_count[i] = base_reg_count - (class_sizes[i] - 1);
2367      ra_reg_count += class_reg_count[i];
2368   }
2369
2370   struct ra_regs *regs = ra_alloc_reg_set(ra_reg_count);
2371   for (int i = 0; i < class_count; i++) {
2372      classes[i] = ra_alloc_reg_class(regs);
2373
2374      for (int i_r = 0; i_r < class_reg_count[i]; i_r++) {
2375	 ra_class_add_reg(regs, classes[i], class_base_reg[i] + i_r);
2376      }
2377
2378      /* Add conflicts between our contiguous registers aliasing
2379       * base regs and other register classes' contiguous registers
2380       * that alias base regs, or the base regs themselves for classes[0].
2381       */
2382      for (int c = 0; c <= i; c++) {
2383	 for (int i_r = 0; i_r < class_reg_count[i]; i_r++) {
2384	    for (int c_r = MAX2(0, i_r - (class_sizes[c] - 1));
2385		 c_r < MIN2(class_reg_count[c], i_r + class_sizes[i]);
2386		 c_r++) {
2387
2388	       if (0) {
2389		  printf("%d/%d conflicts %d/%d\n",
2390			 class_sizes[i], this->first_non_payload_grf + i_r,
2391			 class_sizes[c], this->first_non_payload_grf + c_r);
2392	       }
2393
2394	       ra_add_reg_conflict(regs,
2395				   class_base_reg[i] + i_r,
2396				   class_base_reg[c] + c_r);
2397	    }
2398	 }
2399      }
2400   }
2401
2402   /* Add a special class for aligned pairs, which we'll put delta_x/y
2403    * in on gen5 so that we can do PLN.
2404    */
2405   if (brw->has_pln && intel->gen < 6) {
2406      int reg_count = (base_reg_count - 1) / 2;
2407      int unaligned_pair_class = 1;
2408      assert(class_sizes[unaligned_pair_class] == 2);
2409
2410      aligned_pair_class = class_count;
2411      classes[aligned_pair_class] = ra_alloc_reg_class(regs);
2412      class_base_reg[aligned_pair_class] = 0;
2413      class_reg_count[aligned_pair_class] = 0;
2414      int start = (this->first_non_payload_grf & 1) ? 1 : 0;
2415
2416      for (int i = 0; i < reg_count; i++) {
2417	 ra_class_add_reg(regs, classes[aligned_pair_class],
2418			  class_base_reg[unaligned_pair_class] + i * 2 + start);
2419      }
2420      class_count++;
2421   }
2422
2423   ra_set_finalize(regs);
2424
2425   struct ra_graph *g = ra_alloc_interference_graph(regs,
2426						    this->virtual_grf_next);
2427   /* Node 0 is just a placeholder to keep virtual_grf[] mapping 1:1
2428    * with nodes.
2429    */
2430   ra_set_node_class(g, 0, classes[0]);
2431
2432   for (int i = 1; i < this->virtual_grf_next; i++) {
2433      for (int c = 0; c < class_count; c++) {
2434	 if (class_sizes[c] == this->virtual_grf_sizes[i]) {
2435	    if (aligned_pair_class >= 0 &&
2436		this->delta_x.reg == i) {
2437	       ra_set_node_class(g, i, classes[aligned_pair_class]);
2438	    } else {
2439	       ra_set_node_class(g, i, classes[c]);
2440	    }
2441	    break;
2442	 }
2443      }
2444
2445      for (int j = 1; j < i; j++) {
2446	 if (virtual_grf_interferes(i, j)) {
2447	    ra_add_node_interference(g, i, j);
2448	 }
2449      }
2450   }
2451
2452   /* FINISHME: Handle spilling */
2453   if (!ra_allocate_no_spills(g)) {
2454      fprintf(stderr, "Failed to allocate registers.\n");
2455      this->fail = true;
2456      return;
2457   }
2458
2459   /* Get the chosen virtual registers for each node, and map virtual
2460    * regs in the register classes back down to real hardware reg
2461    * numbers.
2462    */
2463   hw_reg_mapping[0] = 0; /* unused */
2464   for (int i = 1; i < this->virtual_grf_next; i++) {
2465      int reg = ra_get_node_reg(g, i);
2466      int hw_reg = -1;
2467
2468      for (int c = 0; c < class_count; c++) {
2469	 if (reg >= class_base_reg[c] &&
2470	     reg < class_base_reg[c] + class_reg_count[c]) {
2471	    hw_reg = reg - class_base_reg[c];
2472	    break;
2473	 }
2474      }
2475
2476      assert(hw_reg != -1);
2477      hw_reg_mapping[i] = this->first_non_payload_grf + hw_reg;
2478      last_grf = MAX2(last_grf,
2479		      hw_reg_mapping[i] + this->virtual_grf_sizes[i] - 1);
2480   }
2481
2482   foreach_iter(exec_list_iterator, iter, this->instructions) {
2483      fs_inst *inst = (fs_inst *)iter.get();
2484
2485      assign_reg(hw_reg_mapping, &inst->dst);
2486      assign_reg(hw_reg_mapping, &inst->src[0]);
2487      assign_reg(hw_reg_mapping, &inst->src[1]);
2488   }
2489
2490   this->grf_used = last_grf + 1;
2491
2492   talloc_free(g);
2493   talloc_free(regs);
2494}
2495
/**
 * Computes, for each virtual GRF, the instruction index of its first
 * write (def) and last read (use).
 *
 * Accesses inside loops are attributed to the top of the outermost loop,
 * and uses still pointing at the loop top when the loop closes are
 * extended to the loop end -- conservative, since a later iteration may
 * read a value written on an earlier one.
 */
void
fs_visitor::calculate_live_intervals()
{
   int num_vars = this->virtual_grf_next;
   int *def = talloc_array(mem_ctx, int, num_vars);
   int *use = talloc_array(mem_ctx, int, num_vars);
   int loop_depth = 0;
   int loop_start = 0;

   /* def starts "infinitely late" and use "infinitely early" so the
    * MIN2/MAX2 updates below converge on the first def and last use.
    */
   for (int i = 0; i < num_vars; i++) {
      def[i] = 1 << 30;
      use[i] = -1;
   }

   int ip = 0;
   foreach_iter(exec_list_iterator, iter, this->instructions) {
      fs_inst *inst = (fs_inst *)iter.get();

      if (inst->opcode == BRW_OPCODE_DO) {
	 /* Only the outermost loop's start matters for the patch-up. */
	 if (loop_depth++ == 0)
	    loop_start = ip;
      } else if (inst->opcode == BRW_OPCODE_WHILE) {
	 loop_depth--;

	 if (loop_depth == 0) {
	    /* FINISHME:
	     *
	     * Patches up any vars marked for use within the loop as
	     * live until the end.  This is conservative, as there
	     * will often be variables defined and used inside the
	     * loop but dead at the end of the loop body.
	     */
	    for (int i = 0; i < num_vars; i++) {
	       if (use[i] == loop_start) {
		  use[i] = ip;
	       }
	    }
	 }
      } else {
	 int eip = ip;

	 /* Inside a loop, treat the access as happening at the loop top so
	  * the interval covers the whole loop once patched up above.
	  */
	 if (loop_depth)
	    eip = loop_start;

	 for (unsigned int i = 0; i < 3; i++) {
	    if (inst->src[i].file == GRF && inst->src[i].reg != 0) {
	       use[inst->src[i].reg] = MAX2(use[inst->src[i].reg], eip);
	    }
	 }
	 if (inst->dst.file == GRF && inst->dst.reg != 0) {
	    def[inst->dst.reg] = MIN2(def[inst->dst.reg], eip);
	 }
      }

      ip++;
   }

   talloc_free(this->virtual_grf_def);
   talloc_free(this->virtual_grf_use);
   this->virtual_grf_def = def;
   this->virtual_grf_use = use;
}
2558
2559/**
2560 * Attempts to move immediate constants into the immediate
2561 * constant slot of following instructions.
2562 *
 * Immediate constants are a bit tricky -- they have to be in the last
 * operand slot, and abs/negate source modifiers cannot be applied to
 * them, so propagation has to respect both restrictions.
 */
2566
2567bool
2568fs_visitor::propagate_constants()
2569{
2570   bool progress = false;
2571
2572   foreach_iter(exec_list_iterator, iter, this->instructions) {
2573      fs_inst *inst = (fs_inst *)iter.get();
2574
2575      if (inst->opcode != BRW_OPCODE_MOV ||
2576	  inst->predicated ||
2577	  inst->dst.file != GRF || inst->src[0].file != IMM ||
2578	  inst->dst.type != inst->src[0].type)
2579	 continue;
2580
2581      /* Don't bother with cases where we should have had the
2582       * operation on the constant folded in GLSL already.
2583       */
2584      if (inst->saturate)
2585	 continue;
2586
2587      /* Found a move of a constant to a GRF.  Find anything else using the GRF
2588       * before it's written, and replace it with the constant if we can.
2589       */
2590      exec_list_iterator scan_iter = iter;
2591      scan_iter.next();
2592      for (; scan_iter.has_next(); scan_iter.next()) {
2593	 fs_inst *scan_inst = (fs_inst *)scan_iter.get();
2594
2595	 if (scan_inst->opcode == BRW_OPCODE_DO ||
2596	     scan_inst->opcode == BRW_OPCODE_WHILE ||
2597	     scan_inst->opcode == BRW_OPCODE_ELSE ||
2598	     scan_inst->opcode == BRW_OPCODE_ENDIF) {
2599	    break;
2600	 }
2601
2602	 for (int i = 2; i >= 0; i--) {
2603	    if (scan_inst->src[i].file != GRF ||
2604		scan_inst->src[i].reg != inst->dst.reg ||
2605		scan_inst->src[i].reg_offset != inst->dst.reg_offset)
2606	       continue;
2607
2608	    /* Don't bother with cases where we should have had the
2609	     * operation on the constant folded in GLSL already.
2610	     */
2611	    if (scan_inst->src[i].negate || scan_inst->src[i].abs)
2612	       continue;
2613
2614	    switch (scan_inst->opcode) {
2615	    case BRW_OPCODE_MOV:
2616	       scan_inst->src[i] = inst->src[0];
2617	       progress = true;
2618	       break;
2619
2620	    case BRW_OPCODE_MUL:
2621	    case BRW_OPCODE_ADD:
2622	       if (i == 1) {
2623		  scan_inst->src[i] = inst->src[0];
2624		  progress = true;
2625	       } else if (i == 0 && scan_inst->src[1].file != IMM) {
2626		  /* Fit this constant in by commuting the operands */
2627		  scan_inst->src[0] = scan_inst->src[1];
2628		  scan_inst->src[1] = inst->src[0];
2629	       }
2630	       break;
2631	    case BRW_OPCODE_CMP:
2632	       if (i == 1) {
2633		  scan_inst->src[i] = inst->src[0];
2634		  progress = true;
2635	       }
2636	    }
2637	 }
2638
2639	 if (scan_inst->dst.file == GRF &&
2640	     scan_inst->dst.reg == inst->dst.reg &&
2641	     (scan_inst->dst.reg_offset == inst->dst.reg_offset ||
2642	      scan_inst->opcode == FS_OPCODE_TEX)) {
2643	    break;
2644	 }
2645      }
2646   }
2647
2648   return progress;
2649}
2650/**
2651 * Must be called after calculate_live_intervales() to remove unused
2652 * writes to registers -- register allocation will fail otherwise
2653 * because something deffed but not used won't be considered to
2654 * interfere with other regs.
2655 */
2656bool
2657fs_visitor::dead_code_eliminate()
2658{
2659   bool progress = false;
2660   int num_vars = this->virtual_grf_next;
2661   bool dead[num_vars];
2662
2663   for (int i = 0; i < num_vars; i++) {
2664      /* This would be ">=", but FS_OPCODE_DISCARD has a src == dst where
2665       * it writes dst then reads it as src.
2666       */
2667      dead[i] = this->virtual_grf_def[i] > this->virtual_grf_use[i];
2668
2669      if (dead[i]) {
2670	 /* Mark off its interval so it won't interfere with anything. */
2671	 this->virtual_grf_def[i] = -1;
2672	 this->virtual_grf_use[i] = -1;
2673      }
2674   }
2675
2676   foreach_iter(exec_list_iterator, iter, this->instructions) {
2677      fs_inst *inst = (fs_inst *)iter.get();
2678
2679      if (inst->dst.file == GRF && dead[inst->dst.reg]) {
2680	 inst->remove();
2681	 progress = true;
2682      }
2683   }
2684
2685   return progress;
2686}
2687
2688bool
2689fs_visitor::virtual_grf_interferes(int a, int b)
2690{
2691   int start = MAX2(this->virtual_grf_def[a], this->virtual_grf_def[b]);
2692   int end = MIN2(this->virtual_grf_use[a], this->virtual_grf_use[b]);
2693
2694   /* For dead code, just check if the def interferes with the other range. */
2695   if (this->virtual_grf_use[a] == -1) {
2696      return (this->virtual_grf_def[a] >= this->virtual_grf_def[b] &&
2697	      this->virtual_grf_def[a] < this->virtual_grf_use[b]);
2698   }
2699   if (this->virtual_grf_use[b] == -1) {
2700      return (this->virtual_grf_def[b] >= this->virtual_grf_def[a] &&
2701	      this->virtual_grf_def[b] < this->virtual_grf_use[a]);
2702   }
2703
2704   return start <= end;
2705}
2706
2707static struct brw_reg brw_reg_from_fs_reg(fs_reg *reg)
2708{
2709   struct brw_reg brw_reg;
2710
2711   switch (reg->file) {
2712   case GRF:
2713   case ARF:
2714   case MRF:
2715      brw_reg = brw_vec8_reg(reg->file,
2716			    reg->hw_reg, 0);
2717      brw_reg = retype(brw_reg, reg->type);
2718      break;
2719   case IMM:
2720      switch (reg->type) {
2721      case BRW_REGISTER_TYPE_F:
2722	 brw_reg = brw_imm_f(reg->imm.f);
2723	 break;
2724      case BRW_REGISTER_TYPE_D:
2725	 brw_reg = brw_imm_d(reg->imm.i);
2726	 break;
2727      case BRW_REGISTER_TYPE_UD:
2728	 brw_reg = brw_imm_ud(reg->imm.u);
2729	 break;
2730      default:
2731	 assert(!"not reached");
2732	 break;
2733      }
2734      break;
2735   case FIXED_HW_REG:
2736      brw_reg = reg->fixed_hw_reg;
2737      break;
2738   case BAD_FILE:
2739      /* Probably unused. */
2740      brw_reg = brw_null_reg();
2741      break;
2742   case UNIFORM:
2743      assert(!"not reached");
2744      brw_reg = brw_null_reg();
2745      break;
2746   }
2747   if (reg->abs)
2748      brw_reg = brw_abs(brw_reg);
2749   if (reg->negate)
2750      brw_reg = negate(brw_reg);
2751
2752   return brw_reg;
2753}
2754
2755void
2756fs_visitor::generate_code()
2757{
2758   unsigned int annotation_len = 0;
2759   int last_native_inst = 0;
2760   struct brw_instruction *if_stack[16], *loop_stack[16];
2761   int if_stack_depth = 0, loop_stack_depth = 0;
2762   int if_depth_in_loop[16];
2763
2764   if_depth_in_loop[loop_stack_depth] = 0;
2765
2766   memset(&if_stack, 0, sizeof(if_stack));
2767   foreach_iter(exec_list_iterator, iter, this->instructions) {
2768      fs_inst *inst = (fs_inst *)iter.get();
2769      struct brw_reg src[3], dst;
2770
2771      for (unsigned int i = 0; i < 3; i++) {
2772	 src[i] = brw_reg_from_fs_reg(&inst->src[i]);
2773      }
2774      dst = brw_reg_from_fs_reg(&inst->dst);
2775
2776      brw_set_conditionalmod(p, inst->conditional_mod);
2777      brw_set_predicate_control(p, inst->predicated);
2778
2779      switch (inst->opcode) {
2780      case BRW_OPCODE_MOV:
2781	 brw_MOV(p, dst, src[0]);
2782	 break;
2783      case BRW_OPCODE_ADD:
2784	 brw_ADD(p, dst, src[0], src[1]);
2785	 break;
2786      case BRW_OPCODE_MUL:
2787	 brw_MUL(p, dst, src[0], src[1]);
2788	 break;
2789
2790      case BRW_OPCODE_FRC:
2791	 brw_FRC(p, dst, src[0]);
2792	 break;
2793      case BRW_OPCODE_RNDD:
2794	 brw_RNDD(p, dst, src[0]);
2795	 break;
2796      case BRW_OPCODE_RNDZ:
2797	 brw_RNDZ(p, dst, src[0]);
2798	 break;
2799
2800      case BRW_OPCODE_AND:
2801	 brw_AND(p, dst, src[0], src[1]);
2802	 break;
2803      case BRW_OPCODE_OR:
2804	 brw_OR(p, dst, src[0], src[1]);
2805	 break;
2806      case BRW_OPCODE_XOR:
2807	 brw_XOR(p, dst, src[0], src[1]);
2808	 break;
2809
2810      case BRW_OPCODE_CMP:
2811	 brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
2812	 break;
2813      case BRW_OPCODE_SEL:
2814	 brw_SEL(p, dst, src[0], src[1]);
2815	 break;
2816
2817      case BRW_OPCODE_IF:
2818	 assert(if_stack_depth < 16);
2819	 if_stack[if_stack_depth] = brw_IF(p, BRW_EXECUTE_8);
2820	 if_depth_in_loop[loop_stack_depth]++;
2821	 if_stack_depth++;
2822	 break;
2823      case BRW_OPCODE_ELSE:
2824	 if_stack[if_stack_depth - 1] =
2825	    brw_ELSE(p, if_stack[if_stack_depth - 1]);
2826	 break;
2827      case BRW_OPCODE_ENDIF:
2828	 if_stack_depth--;
2829	 brw_ENDIF(p , if_stack[if_stack_depth]);
2830	 if_depth_in_loop[loop_stack_depth]--;
2831	 break;
2832
2833      case BRW_OPCODE_DO:
2834	 loop_stack[loop_stack_depth++] = brw_DO(p, BRW_EXECUTE_8);
2835	 if_depth_in_loop[loop_stack_depth] = 0;
2836	 break;
2837
2838      case BRW_OPCODE_BREAK:
2839	 brw_BREAK(p, if_depth_in_loop[loop_stack_depth]);
2840	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2841	 break;
2842      case BRW_OPCODE_CONTINUE:
2843	 brw_CONT(p, if_depth_in_loop[loop_stack_depth]);
2844	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2845	 break;
2846
2847      case BRW_OPCODE_WHILE: {
2848	 struct brw_instruction *inst0, *inst1;
2849	 GLuint br = 1;
2850
2851	 if (intel->gen >= 5)
2852	    br = 2;
2853
2854	 assert(loop_stack_depth > 0);
2855	 loop_stack_depth--;
2856	 inst0 = inst1 = brw_WHILE(p, loop_stack[loop_stack_depth]);
2857	 /* patch all the BREAK/CONT instructions from last BGNLOOP */
2858	 while (inst0 > loop_stack[loop_stack_depth]) {
2859	    inst0--;
2860	    if (inst0->header.opcode == BRW_OPCODE_BREAK &&
2861		inst0->bits3.if_else.jump_count == 0) {
2862	       inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
2863	    }
2864	    else if (inst0->header.opcode == BRW_OPCODE_CONTINUE &&
2865		     inst0->bits3.if_else.jump_count == 0) {
2866	       inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
2867	    }
2868	 }
2869      }
2870	 break;
2871
2872      case FS_OPCODE_RCP:
2873      case FS_OPCODE_RSQ:
2874      case FS_OPCODE_SQRT:
2875      case FS_OPCODE_EXP2:
2876      case FS_OPCODE_LOG2:
2877      case FS_OPCODE_POW:
2878      case FS_OPCODE_SIN:
2879      case FS_OPCODE_COS:
2880	 generate_math(inst, dst, src);
2881	 break;
2882      case FS_OPCODE_LINTERP:
2883	 generate_linterp(inst, dst, src);
2884	 break;
2885      case FS_OPCODE_TEX:
2886      case FS_OPCODE_TXB:
2887      case FS_OPCODE_TXL:
2888	 generate_tex(inst, dst, src[0]);
2889	 break;
2890      case FS_OPCODE_DISCARD:
2891	 generate_discard(inst, dst /* src0 == dst */);
2892	 break;
2893      case FS_OPCODE_DDX:
2894	 generate_ddx(inst, dst, src[0]);
2895	 break;
2896      case FS_OPCODE_DDY:
2897	 generate_ddy(inst, dst, src[0]);
2898	 break;
2899      case FS_OPCODE_FB_WRITE:
2900	 generate_fb_write(inst);
2901	 break;
2902      default:
2903	 if (inst->opcode < (int)ARRAY_SIZE(brw_opcodes)) {
2904	    _mesa_problem(ctx, "Unsupported opcode `%s' in FS",
2905			  brw_opcodes[inst->opcode].name);
2906	 } else {
2907	    _mesa_problem(ctx, "Unsupported opcode %d in FS", inst->opcode);
2908	 }
2909	 this->fail = true;
2910      }
2911
2912      if (annotation_len < p->nr_insn) {
2913	 annotation_len *= 2;
2914	 if (annotation_len < 16)
2915	    annotation_len = 16;
2916
2917	 this->annotation_string = talloc_realloc(this->mem_ctx,
2918						  annotation_string,
2919						  const char *,
2920						  annotation_len);
2921	 this->annotation_ir = talloc_realloc(this->mem_ctx,
2922					      annotation_ir,
2923					      ir_instruction *,
2924					      annotation_len);
2925      }
2926
2927      for (unsigned int i = last_native_inst; i < p->nr_insn; i++) {
2928	 this->annotation_string[i] = inst->annotation;
2929	 this->annotation_ir[i] = inst->ir;
2930      }
2931      last_native_inst = p->nr_insn;
2932   }
2933}
2934
/**
 * Entry point for the new GLSL-IR-based fragment shader backend.
 *
 * Finds the linked fragment shader, runs the fs_visitor over its IR to
 * build FS IR, runs the optimization loop and register allocation, and
 * emits native code.  Returns GL_FALSE to fall back to the old backend
 * (no GLSL program, the new path disabled, or no fragment shader).
 */
GLboolean
brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c)
{
   struct brw_compile *p = &c->func;
   struct intel_context *intel = &brw->intel;
   GLcontext *ctx = &intel->ctx;
   struct brw_shader *shader = NULL;
   struct gl_shader_program *prog = ctx->Shader.CurrentProgram;

   if (!prog)
      return GL_FALSE;

   if (!using_new_fs)
      return GL_FALSE;

   for (unsigned int i = 0; i < prog->_NumLinkedShaders; i++) {
      if (prog->_LinkedShaders[i]->Type == GL_FRAGMENT_SHADER) {
	 shader = (struct brw_shader *)prog->_LinkedShaders[i];
	 break;
      }
   }
   if (!shader)
      return GL_FALSE;

   /* We always use 8-wide mode, at least for now.  For one, flow
    * control only works in 8-wide.  Also, when we're fragment shader
    * bound, we're almost always under register pressure as well, so
    * 8-wide would save us from the performance cliff of spilling
    * regs.
    */
   c->dispatch_width = 8;

   if (INTEL_DEBUG & DEBUG_WM) {
      printf("GLSL IR for native fragment shader %d:\n", prog->Name);
      _mesa_print_ir(shader->ir, NULL);
      printf("\n");
   }

   /* Now the main event: Visit the shader IR and generate our FS IR for it.
    */
   fs_visitor v(c, shader);

   if (0) {
      v.emit_dummy_fs();
   } else {
      v.calculate_urb_setup();
      if (intel->gen < 6)
	 v.emit_interpolation_setup_gen4();
      else
	 v.emit_interpolation_setup_gen6();

      /* Generate FS IR for main().  (the visitor only descends into
       * functions called "main").
       */
      foreach_iter(exec_list_iterator, iter, *shader->ir) {
	 ir_instruction *ir = (ir_instruction *)iter.get();
	 v.base_ir = ir;
	 ir->accept(&v);
      }

      v.emit_fb_writes();
      v.assign_curb_setup();
      v.assign_urb_setup();

      /* Iterate constant propagation and dead code elimination to a
       * fixed point before register allocation.
       */
      bool progress;
      do {
	 progress = false;

	 v.calculate_live_intervals();
	 progress = v.propagate_constants() || progress;
	 progress = v.dead_code_eliminate() || progress;
      } while (progress);

      if (0)
	 v.assign_regs_trivial();
      else
	 v.assign_regs();
   }

   if (!v.fail)
      v.generate_code();

   assert(!v.fail); /* FINISHME: Cleanly fail, tested at link time, etc. */

   if (v.fail)
      return GL_FALSE;

   /* Debug disassembly, interleaved with the annotations recorded by
    * generate_code().
    */
   if (INTEL_DEBUG & DEBUG_WM) {
      const char *last_annotation_string = NULL;
      ir_instruction *last_annotation_ir = NULL;

      printf("Native code for fragment shader %d:\n", prog->Name);
      for (unsigned int i = 0; i < p->nr_insn; i++) {
	 if (last_annotation_ir != v.annotation_ir[i]) {
	    last_annotation_ir = v.annotation_ir[i];
	    if (last_annotation_ir) {
	       printf("   ");
	       last_annotation_ir->print();
	       printf("\n");
	    }
	 }
	 if (last_annotation_string != v.annotation_string[i]) {
	    last_annotation_string = v.annotation_string[i];
	    if (last_annotation_string)
	       printf("   %s\n", last_annotation_string);
	 }
	 brw_disasm(stdout, &p->store[i], intel->gen);
      }
      printf("\n");
   }

   c->prog_data.total_grf = v.grf_used;
   c->prog_data.total_scratch = 0;

   return GL_TRUE;
}
3051