brw_fs.cpp revision 9935fe705df44bb633039ca74332cc0c126ccc30
/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Eric Anholt <eric@anholt.net>
 *
 */

extern "C" {

#include <sys/types.h>

#include "main/macros.h"
#include "main/shaderobj.h"
#include "main/uniforms.h"
#include "program/prog_parameter.h"
#include "program/prog_print.h"
#include "program/prog_optimize.h"
#include "program/register_allocate.h"
#include "program/sampler.h"
#include "program/hash_table.h"
#include "brw_context.h"
#include "brw_eu.h"
#include "brw_wm.h"
#include "talloc.h"
}
#include "brw_fs.h"
#include "../glsl/glsl_types.h"
#include "../glsl/ir_optimization.h"
#include "../glsl/ir_print_visitor.h"

static struct brw_reg brw_reg_from_fs_reg(class fs_reg *reg);

struct gl_shader *
brw_new_shader(struct gl_context *ctx, GLuint name, GLuint type)
{
   struct brw_shader *shader;

   shader = talloc_zero(NULL, struct brw_shader);
   if (shader) {
      shader->base.Type = type;
      shader->base.Name = name;
      _mesa_init_shader(ctx, &shader->base);
   }

   return &shader->base;
}

struct gl_shader_program *
brw_new_shader_program(struct gl_context *ctx, GLuint name)
{
   struct brw_shader_program *prog;
   prog = talloc_zero(NULL, struct brw_shader_program);
   if (prog) {
      prog->base.Name = name;
      _mesa_init_shader_program(ctx, &prog->base);
   }
   return &prog->base;
}

GLboolean
brw_compile_shader(struct gl_context *ctx, struct gl_shader *shader)
{
   if (!_mesa_ir_compile_shader(ctx, shader))
      return GL_FALSE;

   return GL_TRUE;
}

GLboolean
brw_link_shader(struct gl_context *ctx, struct gl_shader_program *prog)
{
   struct brw_shader *shader =
      (struct brw_shader *)prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
   if (shader != NULL) {
      void *mem_ctx = talloc_new(NULL);
      bool progress;

      if (shader->ir)
	 talloc_free(shader->ir);
      shader->ir = new(shader) exec_list;
      clone_ir_list(mem_ctx, shader->ir, shader->base.ir);

      do_mat_op_to_vec(shader->ir);
      do_mod_to_fract(shader->ir);
      do_div_to_mul_rcp(shader->ir);
      do_sub_to_add_neg(shader->ir);
      do_explog_to_explog2(shader->ir);
      do_lower_texture_projection(shader->ir);
      brw_do_cubemap_normalize(shader->ir);

      do {
	 progress = false;

	 brw_do_channel_expressions(shader->ir);
	 brw_do_vector_splitting(shader->ir);

	 progress = do_lower_jumps(shader->ir, true, true,
				   true, /* main return */
				   false, /* continue */
				   false /* loops */
				   ) || progress;

	 progress = do_common_optimization(shader->ir, true, 32) || progress;

	 progress = lower_noise(shader->ir) || progress;
	 progress =
	    lower_variable_index_to_cond_assign(shader->ir,
						GL_TRUE, /* input */
						GL_TRUE, /* output */
						GL_TRUE, /* temp */
						GL_TRUE /* uniform */
						) || progress;
      } while (progress);

      validate_ir_tree(shader->ir);

      reparent_ir(shader->ir, shader->ir);
      talloc_free(mem_ctx);
   }

   if (!_mesa_ir_link_shader(ctx, prog))
      return GL_FALSE;

   return GL_TRUE;
}

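/**
 * Returns the number of scalar components in a value of the given GLSL
 * type, which is how much virtual GRF space is allocated for it.
 */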
static int
type_size(const struct glsl_type *type)
{
   unsigned int size, i;

   switch (type->base_type) {
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_BOOL:
      return type->components();
   case GLSL_TYPE_ARRAY:
      return type_size(type->fields.array) * type->length;
   case GLSL_TYPE_STRUCT:
      size = 0;
      for (i = 0; i < type->length; i++) {
	 size += type_size(type->fields.structure[i].type);
      }
      return size;
   case GLSL_TYPE_SAMPLER:
      /* Samplers take up no register space, since they're baked in at
       * link time.
       */
      return 0;
   default:
      assert(!"not reached");
      return 0;
   }
}

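/**
 * Allocates a new virtual GRF of the given size and returns its index,
 * doubling the size-tracking array whenever it fills up.
 */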
int
fs_visitor::virtual_grf_alloc(int size)
{
   if (virtual_grf_array_size <= virtual_grf_next) {
      if (virtual_grf_array_size == 0)
	 virtual_grf_array_size = 16;
      else
	 virtual_grf_array_size *= 2;
      virtual_grf_sizes = talloc_realloc(mem_ctx, virtual_grf_sizes,
					 int, virtual_grf_array_size);

      /* This slot is always unused. */
      virtual_grf_sizes[0] = 0;
   }
   virtual_grf_sizes[virtual_grf_next] = size;
   return virtual_grf_next++;
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int hw_reg)
{
   init();
   this->file = file;
   this->hw_reg = hw_reg;
   this->type = BRW_REGISTER_TYPE_F;
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int hw_reg, uint32_t type)
{
   init();
   this->file = file;
   this->hw_reg = hw_reg;
   this->type = type;
}

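/** Returns the BRW hardware register type to use for a GLSL base type. */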
int
brw_type_for_base_type(const struct glsl_type *type)
{
   switch (type->base_type) {
   case GLSL_TYPE_FLOAT:
      return BRW_REGISTER_TYPE_F;
   case GLSL_TYPE_INT:
   case GLSL_TYPE_BOOL:
      return BRW_REGISTER_TYPE_D;
   case GLSL_TYPE_UINT:
      return BRW_REGISTER_TYPE_UD;
   case GLSL_TYPE_ARRAY:
   case GLSL_TYPE_STRUCT:
   case GLSL_TYPE_SAMPLER:
      /* These should be overridden with the type of the member when
       * dereferenced into.  BRW_REGISTER_TYPE_UD seems like a likely
       * way to trip up if we don't.
       */
      return BRW_REGISTER_TYPE_UD;
   default:
      assert(!"not reached");
      return BRW_REGISTER_TYPE_F;
   }
}

/** Automatic reg constructor. */
fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
{
   init();

   this->file = GRF;
   this->reg = v->virtual_grf_alloc(type_size(type));
   this->reg_offset = 0;
   this->type = brw_type_for_base_type(type);
}

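/** Returns the fs_reg previously allocated for an ir_variable, if any. */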
fs_reg *
fs_visitor::variable_storage(ir_variable *var)
{
   return (fs_reg *)hash_table_find(this->variable_ht, var);
}

/* Our support for uniforms is piggy-backed on the struct
 * gl_fragment_program, because that's where the values actually
 * get stored, rather than in some global gl_shader_program uniform
 * store.
 */
int
fs_visitor::setup_uniform_values(int loc, const glsl_type *type)
{
   unsigned int offset = 0;
   float *vec_values;

   if (type->is_matrix()) {
      const glsl_type *column = glsl_type::get_instance(GLSL_TYPE_FLOAT,
							type->vector_elements,
							1);

      for (unsigned int i = 0; i < type->matrix_columns; i++) {
	 offset += setup_uniform_values(loc + offset, column);
      }

      return offset;
   }

   switch (type->base_type) {
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_BOOL:
      vec_values = fp->Base.Parameters->ParameterValues[loc];
      for (unsigned int i = 0; i < type->vector_elements; i++) {
	 unsigned int param = c->prog_data.nr_params++;

	 assert(param < ARRAY_SIZE(c->prog_data.param));

	 switch (type->base_type) {
	 case GLSL_TYPE_FLOAT:
	    c->prog_data.param_convert[param] = PARAM_NO_CONVERT;
	    break;
	 case GLSL_TYPE_UINT:
	    c->prog_data.param_convert[param] = PARAM_CONVERT_F2U;
	    break;
	 case GLSL_TYPE_INT:
	    c->prog_data.param_convert[param] = PARAM_CONVERT_F2I;
	    break;
	 case GLSL_TYPE_BOOL:
	    c->prog_data.param_convert[param] = PARAM_CONVERT_F2B;
	    break;
	 }

	 c->prog_data.param[param] = &vec_values[i];
      }
      return 1;

   case GLSL_TYPE_STRUCT:
      for (unsigned int i = 0; i < type->length; i++) {
	 offset += setup_uniform_values(loc + offset,
					type->fields.structure[i].type);
      }
      return offset;

   case GLSL_TYPE_ARRAY:
      for (unsigned int i = 0; i < type->length; i++) {
	 offset += setup_uniform_values(loc + offset, type->fields.array);
      }
      return offset;

   case GLSL_TYPE_SAMPLER:
      /* The sampler takes up a slot, but we don't use any values from it. */
      return 1;

   default:
      assert(!"not reached");
      return 0;
   }
}


/* Our support for builtin uniforms is even scarier than non-builtin.
 * It sits on top of the PROG_STATE_VAR parameters that are
 * automatically updated from GL context state.
 */
void
fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
{
   const struct gl_builtin_uniform_desc *statevar = NULL;

   for (unsigned int i = 0; _mesa_builtin_uniform_desc[i].name; i++) {
      statevar = &_mesa_builtin_uniform_desc[i];
      if (strcmp(ir->name, _mesa_builtin_uniform_desc[i].name) == 0)
	 break;
   }

   if (!statevar->name) {
      this->fail = true;
      printf("Failed to find builtin uniform `%s'\n", ir->name);
      return;
   }

   int array_count;
   if (ir->type->is_array()) {
      array_count = ir->type->length;
   } else {
      array_count = 1;
   }

   for (int a = 0; a < array_count; a++) {
      for (unsigned int i = 0; i < statevar->num_elements; i++) {
	 struct gl_builtin_uniform_element *element = &statevar->elements[i];
	 int tokens[STATE_LENGTH];

	 memcpy(tokens, element->tokens, sizeof(element->tokens));
	 if (ir->type->is_array()) {
	    tokens[1] = a;
	 }

	 /* This state reference has already been set up by ir_to_mesa,
	  * but we'll get the same index back here.
	  */
	 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
					       (gl_state_index *)tokens);
	 float *vec_values = this->fp->Base.Parameters->ParameterValues[index];

	 /* Add each of the unique swizzles of the element as a
	  * parameter.  This'll end up matching the expected layout of
	  * the array/matrix/structure we're trying to fill in.
	  */
	 int last_swiz = -1;
	 for (unsigned int i = 0; i < 4; i++) {
	    int swiz = GET_SWZ(element->swizzle, i);
	    if (swiz == last_swiz)
	       break;
	    last_swiz = swiz;

	    c->prog_data.param_convert[c->prog_data.nr_params] =
	       PARAM_NO_CONVERT;
	    c->prog_data.param[c->prog_data.nr_params++] = &vec_values[swiz];
	 }
      }
   }
}

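/**
 * Emits the gl_FragCoord setup: X/Y from the computed pixel centers
 * (with the half-pixel offset and Y flip as the layout qualifiers and
 * render-to-FBO state require), interpolated Z, and the W value already
 * computed by the interpolation setup.
 */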
fs_reg *
fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   fs_reg wpos = *reg;
   fs_reg neg_y = this->pixel_y;
   neg_y.negate = true;
   bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;

   /* gl_FragCoord.x */
   if (ir->pixel_center_integer) {
      emit(fs_inst(BRW_OPCODE_MOV, wpos, this->pixel_x));
   } else {
      emit(fs_inst(BRW_OPCODE_ADD, wpos, this->pixel_x, fs_reg(0.5f)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.y */
   if (!flip && ir->pixel_center_integer) {
      emit(fs_inst(BRW_OPCODE_MOV, wpos, this->pixel_y));
   } else {
      fs_reg pixel_y = this->pixel_y;
      float offset = (ir->pixel_center_integer ? 0.0 : 0.5);

      if (flip) {
	 pixel_y.negate = true;
	 offset += c->key.drawable_height - 1.0;
      }

      emit(fs_inst(BRW_OPCODE_ADD, wpos, pixel_y, fs_reg(offset)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.z */
   emit(fs_inst(FS_OPCODE_LINTERP, wpos, this->delta_x, this->delta_y,
		interp_reg(FRAG_ATTRIB_WPOS, 2)));
   wpos.reg_offset++;

   /* gl_FragCoord.w: Already set up in emit_interpolation_setup */
   emit(fs_inst(BRW_OPCODE_MOV, wpos, this->wpos_w));

   return reg;
}

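/**
 * Emits LINTERP instructions for each component of a varying input,
 * including the multiply by the pixel 1/W for perspective correction on
 * pre-gen6 hardware.
 */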
fs_reg *
fs_visitor::emit_general_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   /* Interpolation is always in floating point regs. */
   reg->type = BRW_REGISTER_TYPE_F;
   fs_reg attr = *reg;

   unsigned int array_elements;
   const glsl_type *type;

   if (ir->type->is_array()) {
      array_elements = ir->type->length;
      if (array_elements == 0) {
	 this->fail = true;
      }
      type = ir->type->fields.array;
   } else {
      array_elements = 1;
      type = ir->type;
   }

   int location = ir->location;
   for (unsigned int i = 0; i < array_elements; i++) {
      for (unsigned int j = 0; j < type->matrix_columns; j++) {
	 if (urb_setup[location] == -1) {
	    /* If there's no incoming setup data for this slot, don't
	     * emit interpolation for it.
	     */
	    attr.reg_offset += type->vector_elements;
	    location++;
	    continue;
	 }

	 for (unsigned int c = 0; c < type->vector_elements; c++) {
	    struct brw_reg interp = interp_reg(location, c);
	    emit(fs_inst(FS_OPCODE_LINTERP,
			 attr,
			 this->delta_x,
			 this->delta_y,
			 fs_reg(interp)));
	    attr.reg_offset++;
	 }

	 if (intel->gen < 6) {
	    attr.reg_offset -= type->vector_elements;
	    for (unsigned int c = 0; c < type->vector_elements; c++) {
	       emit(fs_inst(BRW_OPCODE_MUL,
			    attr,
			    attr,
			    this->pixel_w));
	       attr.reg_offset++;
	    }
	 }
	 location++;
      }
   }

   return reg;
}

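/**
 * Emits the gl_FrontFacing setup, turning the front/back-facing bit in
 * the thread payload into a 0/1 value in a register.
 */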
fs_reg *
fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);

   /* The frontfacing comes in as a bit in the thread payload. */
   if (intel->gen >= 6) {
      emit(fs_inst(BRW_OPCODE_ASR,
		   *reg,
		   fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
		   fs_reg(15)));
      emit(fs_inst(BRW_OPCODE_NOT,
		   *reg,
		   *reg));
      emit(fs_inst(BRW_OPCODE_AND,
		   *reg,
		   *reg,
		   fs_reg(1)));
   } else {
      struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
      /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
       * us front face
       */
      fs_inst *inst = emit(fs_inst(BRW_OPCODE_CMP,
				   *reg,
				   fs_reg(r1_6ud),
				   fs_reg(1u << 31)));
      inst->conditional_mod = BRW_CONDITIONAL_L;
      emit(fs_inst(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u)));
   }

   return reg;
}

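/**
 * Emits a single-source math instruction, expanding uniform (hstride == 0)
 * sources for gen6 and setting up the MRF-based message form for earlier
 * generations.
 */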
fs_inst *
fs_visitor::emit_math(fs_opcodes opcode, fs_reg dst, fs_reg src)
{
   switch (opcode) {
   case FS_OPCODE_RCP:
   case FS_OPCODE_RSQ:
   case FS_OPCODE_SQRT:
   case FS_OPCODE_EXP2:
   case FS_OPCODE_LOG2:
   case FS_OPCODE_SIN:
   case FS_OPCODE_COS:
      break;
   default:
      assert(!"not reached: bad math opcode");
      return NULL;
   }

   /* Can't do hstride == 0 args to gen6 math, so expand it out.  We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    */
   if (intel->gen >= 6 && src.file == UNIFORM) {
      fs_reg expanded = fs_reg(this, glsl_type::float_type);
      emit(fs_inst(BRW_OPCODE_MOV, expanded, src));
      src = expanded;
   }

   fs_inst *inst = emit(fs_inst(opcode, dst, src));

   if (intel->gen < 6) {
      inst->base_mrf = 2;
      inst->mlen = 1;
   }

   return inst;
}

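/**
 * Emits a two-source math instruction (currently only POW), with the same
 * uniform-source expansion for gen6 and the MRF-based message form for
 * earlier generations as the single-source case.
 */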
fs_inst *
fs_visitor::emit_math(fs_opcodes opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   int base_mrf = 2;
   fs_inst *inst;

   assert(opcode == FS_OPCODE_POW);

   if (intel->gen >= 6) {
      /* Can't do hstride == 0 args to gen6 math, so expand it out. */
      if (src0.file == UNIFORM) {
	 fs_reg expanded = fs_reg(this, glsl_type::float_type);
	 emit(fs_inst(BRW_OPCODE_MOV, expanded, src0));
	 src0 = expanded;
      }

      if (src1.file == UNIFORM) {
	 fs_reg expanded = fs_reg(this, glsl_type::float_type);
	 emit(fs_inst(BRW_OPCODE_MOV, expanded, src1));
	 src1 = expanded;
      }

      inst = emit(fs_inst(opcode, dst, src0, src1));
   } else {
      emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1), src1));
      inst = emit(fs_inst(opcode, dst, src0, reg_null_f));

      inst->base_mrf = base_mrf;
      inst->mlen = 2;
   }
   return inst;
}

void
fs_visitor::visit(ir_variable *ir)
{
   fs_reg *reg = NULL;

   if (variable_storage(ir))
      return;

   if (strcmp(ir->name, "gl_FragColor") == 0) {
      this->frag_color = ir;
   } else if (strcmp(ir->name, "gl_FragData") == 0) {
      this->frag_data = ir;
   } else if (strcmp(ir->name, "gl_FragDepth") == 0) {
      this->frag_depth = ir;
   }

   if (ir->mode == ir_var_in) {
      if (!strcmp(ir->name, "gl_FragCoord")) {
	 reg = emit_fragcoord_interpolation(ir);
      } else if (!strcmp(ir->name, "gl_FrontFacing")) {
	 reg = emit_frontfacing_interpolation(ir);
      } else {
	 reg = emit_general_interpolation(ir);
      }
      assert(reg);
      hash_table_insert(this->variable_ht, reg, ir);
      return;
   }

   if (ir->mode == ir_var_uniform) {
      int param_index = c->prog_data.nr_params;

      if (!strncmp(ir->name, "gl_", 3)) {
	 setup_builtin_uniform_values(ir);
      } else {
	 setup_uniform_values(ir->location, ir->type);
      }

      reg = new(this->mem_ctx) fs_reg(UNIFORM, param_index);
      reg->type = brw_type_for_base_type(ir->type);
   }

   if (!reg)
      reg = new(this->mem_ctx) fs_reg(this, ir->type);

   hash_table_insert(this->variable_ht, reg, ir);
}

void
fs_visitor::visit(ir_dereference_variable *ir)
{
   fs_reg *reg = variable_storage(ir->var);
   this->result = *reg;
}

void
fs_visitor::visit(ir_dereference_record *ir)
{
   const glsl_type *struct_type = ir->record->type;

   ir->record->accept(this);

   unsigned int offset = 0;
   for (unsigned int i = 0; i < struct_type->length; i++) {
      if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
	 break;
      offset += type_size(struct_type->fields.structure[i].type);
   }
   this->result.reg_offset += offset;
   this->result.type = brw_type_for_base_type(ir->type);
}

void
fs_visitor::visit(ir_dereference_array *ir)
{
   ir_constant *index;
   int element_size;

   ir->array->accept(this);
   index = ir->array_index->as_constant();

   element_size = type_size(ir->type);
   this->result.type = brw_type_for_base_type(ir->type);

   if (index) {
      assert(this->result.file == UNIFORM ||
	     (this->result.file == GRF &&
	      this->result.reg != 0));
      this->result.reg_offset += index->value.i[0] * element_size;
   } else {
      assert(!"FINISHME: non-constant array element");
   }
}

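/**
 * Visits an expression node, relying on the channel-expression and
 * vector-splitting passes to have already reduced matrix and vector
 * operands to scalars.
 */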
void
fs_visitor::visit(ir_expression *ir)
{
   unsigned int operand;
   fs_reg op[2], temp;
   fs_inst *inst;

   for (operand = 0; operand < ir->get_num_operands(); operand++) {
      ir->operands[operand]->accept(this);
      if (this->result.file == BAD_FILE) {
	 ir_print_visitor v;
	 printf("Failed to get tree for expression operand:\n");
	 ir->operands[operand]->accept(&v);
	 this->fail = true;
      }
      op[operand] = this->result;

      /* Matrix expression operands should have been broken down to vector
       * operations already.
       */
      assert(!ir->operands[operand]->type->is_matrix());
      /* And then those vector operands should have been broken down to scalar.
       */
      assert(!ir->operands[operand]->type->is_vector());
   }

   /* Storage for our result.  If our result goes into an assignment, it will
    * just get copy-propagated out, so no worries.
    */
   this->result = fs_reg(this, ir->type);

   switch (ir->operation) {
   case ir_unop_logic_not:
      /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
       * the one's complement of the whole register, not just bit 0.
       */
      emit(fs_inst(BRW_OPCODE_XOR, this->result, op[0], fs_reg(1)));
      break;
   case ir_unop_neg:
      op[0].negate = !op[0].negate;
      this->result = op[0];
      break;
   case ir_unop_abs:
      op[0].abs = true;
      this->result = op[0];
      break;
   case ir_unop_sign:
      temp = fs_reg(this, ir->type);

      emit(fs_inst(BRW_OPCODE_MOV, this->result, fs_reg(0.0f)));

      inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null_f, op[0], fs_reg(0.0f)));
      inst->conditional_mod = BRW_CONDITIONAL_G;
      inst = emit(fs_inst(BRW_OPCODE_MOV, this->result, fs_reg(1.0f)));
      inst->predicated = true;

      inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null_f, op[0], fs_reg(0.0f)));
      inst->conditional_mod = BRW_CONDITIONAL_L;
      inst = emit(fs_inst(BRW_OPCODE_MOV, this->result, fs_reg(-1.0f)));
      inst->predicated = true;

      break;
   case ir_unop_rcp:
      emit_math(FS_OPCODE_RCP, this->result, op[0]);
      break;

   case ir_unop_exp2:
      emit_math(FS_OPCODE_EXP2, this->result, op[0]);
      break;
   case ir_unop_log2:
      emit_math(FS_OPCODE_LOG2, this->result, op[0]);
      break;
   case ir_unop_exp:
   case ir_unop_log:
      assert(!"not reached: should be handled by ir_explog_to_explog2");
      break;
   case ir_unop_sin:
      emit_math(FS_OPCODE_SIN, this->result, op[0]);
      break;
   case ir_unop_cos:
      emit_math(FS_OPCODE_COS, this->result, op[0]);
      break;

   case ir_unop_dFdx:
      emit(fs_inst(FS_OPCODE_DDX, this->result, op[0]));
      break;
   case ir_unop_dFdy:
      emit(fs_inst(FS_OPCODE_DDY, this->result, op[0]));
      break;

   case ir_binop_add:
      emit(fs_inst(BRW_OPCODE_ADD, this->result, op[0], op[1]));
      break;
   case ir_binop_sub:
      assert(!"not reached: should be handled by ir_sub_to_add_neg");
      break;

   case ir_binop_mul:
      emit(fs_inst(BRW_OPCODE_MUL, this->result, op[0], op[1]));
      break;
   case ir_binop_div:
      assert(!"not reached: should be handled by ir_div_to_mul_rcp");
      break;
   case ir_binop_mod:
      assert(!"ir_binop_mod should have been converted to b * fract(a/b)");
      break;

   case ir_binop_less:
      inst = emit(fs_inst(BRW_OPCODE_CMP, this->result, op[0], op[1]));
      inst->conditional_mod = BRW_CONDITIONAL_L;
      emit(fs_inst(BRW_OPCODE_AND, this->result, this->result, fs_reg(0x1)));
      break;
   case ir_binop_greater:
      inst = emit(fs_inst(BRW_OPCODE_CMP, this->result, op[0], op[1]));
      inst->conditional_mod = BRW_CONDITIONAL_G;
      emit(fs_inst(BRW_OPCODE_AND, this->result, this->result, fs_reg(0x1)));
      break;
   case ir_binop_lequal:
      inst = emit(fs_inst(BRW_OPCODE_CMP, this->result, op[0], op[1]));
      inst->conditional_mod = BRW_CONDITIONAL_LE;
      emit(fs_inst(BRW_OPCODE_AND, this->result, this->result, fs_reg(0x1)));
      break;
   case ir_binop_gequal:
      inst = emit(fs_inst(BRW_OPCODE_CMP, this->result, op[0], op[1]));
      inst->conditional_mod = BRW_CONDITIONAL_GE;
      emit(fs_inst(BRW_OPCODE_AND, this->result, this->result, fs_reg(0x1)));
      break;
   case ir_binop_equal:
   case ir_binop_all_equal: /* same as equal for scalars */
      inst = emit(fs_inst(BRW_OPCODE_CMP, this->result, op[0], op[1]));
      inst->conditional_mod = BRW_CONDITIONAL_Z;
      emit(fs_inst(BRW_OPCODE_AND, this->result, this->result, fs_reg(0x1)));
      break;
   case ir_binop_nequal:
   case ir_binop_any_nequal: /* same as nequal for scalars */
      inst = emit(fs_inst(BRW_OPCODE_CMP, this->result, op[0], op[1]));
      inst->conditional_mod = BRW_CONDITIONAL_NZ;
      emit(fs_inst(BRW_OPCODE_AND, this->result, this->result, fs_reg(0x1)));
      break;

   case ir_binop_logic_xor:
      emit(fs_inst(BRW_OPCODE_XOR, this->result, op[0], op[1]));
      break;

   case ir_binop_logic_or:
      emit(fs_inst(BRW_OPCODE_OR, this->result, op[0], op[1]));
      break;

   case ir_binop_logic_and:
      emit(fs_inst(BRW_OPCODE_AND, this->result, op[0], op[1]));
      break;

   case ir_binop_dot:
   case ir_unop_any:
      assert(!"not reached: should be handled by brw_fs_channel_expressions");
      break;

   case ir_unop_noise:
      assert(!"not reached: should be handled by lower_noise");
      break;

   case ir_unop_sqrt:
      emit_math(FS_OPCODE_SQRT, this->result, op[0]);
      break;

   case ir_unop_rsq:
      emit_math(FS_OPCODE_RSQ, this->result, op[0]);
      break;

   case ir_unop_i2f:
   case ir_unop_b2f:
   case ir_unop_b2i:
   case ir_unop_f2i:
      emit(fs_inst(BRW_OPCODE_MOV, this->result, op[0]));
      break;
   case ir_unop_f2b:
   case ir_unop_i2b:
      inst = emit(fs_inst(BRW_OPCODE_CMP, this->result, op[0], fs_reg(0.0f)));
      inst->conditional_mod = BRW_CONDITIONAL_NZ;
      inst = emit(fs_inst(BRW_OPCODE_AND, this->result,
			  this->result, fs_reg(1)));
      break;

   case ir_unop_trunc:
      emit(fs_inst(BRW_OPCODE_RNDZ, this->result, op[0]));
      break;
   case ir_unop_ceil:
      op[0].negate = !op[0].negate;
      inst = emit(fs_inst(BRW_OPCODE_RNDD, this->result, op[0]));
      this->result.negate = true;
      break;
   case ir_unop_floor:
      inst = emit(fs_inst(BRW_OPCODE_RNDD, this->result, op[0]));
      break;
   case ir_unop_fract:
      inst = emit(fs_inst(BRW_OPCODE_FRC, this->result, op[0]));
      break;
   case ir_unop_round_even:
      emit(fs_inst(BRW_OPCODE_RNDE, this->result, op[0]));
      break;

   case ir_binop_min:
      inst = emit(fs_inst(BRW_OPCODE_CMP, this->result, op[0], op[1]));
      inst->conditional_mod = BRW_CONDITIONAL_L;

      inst = emit(fs_inst(BRW_OPCODE_SEL, this->result, op[0], op[1]));
      inst->predicated = true;
      break;
   case ir_binop_max:
      inst = emit(fs_inst(BRW_OPCODE_CMP, this->result, op[0], op[1]));
      inst->conditional_mod = BRW_CONDITIONAL_G;

      inst = emit(fs_inst(BRW_OPCODE_SEL, this->result, op[0], op[1]));
      inst->predicated = true;
      break;

   case ir_binop_pow:
      emit_math(FS_OPCODE_POW, this->result, op[0], op[1]);
      break;

   case ir_unop_bit_not:
      inst = emit(fs_inst(BRW_OPCODE_NOT, this->result, op[0]));
      break;
   case ir_binop_bit_and:
      inst = emit(fs_inst(BRW_OPCODE_AND, this->result, op[0], op[1]));
      break;
   case ir_binop_bit_xor:
      inst = emit(fs_inst(BRW_OPCODE_XOR, this->result, op[0], op[1]));
      break;
   case ir_binop_bit_or:
      inst = emit(fs_inst(BRW_OPCODE_OR, this->result, op[0], op[1]));
      break;

   case ir_unop_u2f:
   case ir_binop_lshift:
   case ir_binop_rshift:
      assert(!"GLSL 1.30 features unsupported");
      break;
   }
}

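/**
 * Recursively emits the per-component MOVs for an aggregate assignment,
 * predicated on the condition's flag result when one is present.
 */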
void
fs_visitor::emit_assignment_writes(fs_reg &l, fs_reg &r,
				   const glsl_type *type, bool predicated)
{
   switch (type->base_type) {
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_BOOL:
      for (unsigned int i = 0; i < type->components(); i++) {
	 l.type = brw_type_for_base_type(type);
	 r.type = brw_type_for_base_type(type);

	 fs_inst *inst = emit(fs_inst(BRW_OPCODE_MOV, l, r));
	 inst->predicated = predicated;

	 l.reg_offset++;
	 r.reg_offset++;
      }
      break;
   case GLSL_TYPE_ARRAY:
      for (unsigned int i = 0; i < type->length; i++) {
	 emit_assignment_writes(l, r, type->fields.array, predicated);
      }
      break;

   case GLSL_TYPE_STRUCT:
      for (unsigned int i = 0; i < type->length; i++) {
	 emit_assignment_writes(l, r, type->fields.structure[i].type,
				predicated);
      }
      break;

   case GLSL_TYPE_SAMPLER:
      break;

   default:
      assert(!"not reached");
      break;
   }
}

void
fs_visitor::visit(ir_assignment *ir)
{
   struct fs_reg l, r;
   fs_inst *inst;

   /* FINISHME: arrays on the lhs */
   ir->lhs->accept(this);
   l = this->result;

   ir->rhs->accept(this);
   r = this->result;

   assert(l.file != BAD_FILE);
   assert(r.file != BAD_FILE);

   if (ir->condition) {
      emit_bool_to_cond_code(ir->condition);
   }

   if (ir->lhs->type->is_scalar() ||
       ir->lhs->type->is_vector()) {
      for (int i = 0; i < ir->lhs->type->vector_elements; i++) {
	 if (ir->write_mask & (1 << i)) {
	    inst = emit(fs_inst(BRW_OPCODE_MOV, l, r));
	    if (ir->condition)
	       inst->predicated = true;
	    r.reg_offset++;
	 }
	 l.reg_offset++;
      }
   } else {
      emit_assignment_writes(l, r, ir->lhs->type, ir->condition != NULL);
   }
}

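/**
 * Builds the gen4 sampler message payload, falling back to a SIMD16
 * message (with the result deinterleaved afterwards) for
 * non-shadow-compare bias/LOD lookups.
 */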
fs_inst *
fs_visitor::emit_texture_gen4(ir_texture *ir, fs_reg dst, fs_reg coordinate)
{
   int mlen;
   int base_mrf = 1;
   bool simd16 = false;
   fs_reg orig_dst;

   /* g0 header. */
   mlen = 1;

   if (ir->shadow_comparitor) {
      for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
	 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i),
		      coordinate));
	 coordinate.reg_offset++;
      }
      /* gen4's SIMD8 sampler always has the slots for u,v,r present. */
      mlen += 3;

      if (ir->op == ir_tex) {
	 /* There's no plain shadow compare message, so we use shadow
	  * compare with a bias of 0.0.
	  */
	 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen),
		      fs_reg(0.0f)));
	 mlen++;
      } else if (ir->op == ir_txb) {
	 ir->lod_info.bias->accept(this);
	 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen),
		      this->result));
	 mlen++;
      } else {
	 assert(ir->op == ir_txl);
	 ir->lod_info.lod->accept(this);
	 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen),
		      this->result));
	 mlen++;
      }

      ir->shadow_comparitor->accept(this);
      emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result));
      mlen++;
   } else if (ir->op == ir_tex) {
      for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
	 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i),
		      coordinate));
	 coordinate.reg_offset++;
      }
      /* gen4's SIMD8 sampler always has the slots for u,v,r present. */
      mlen += 3;
   } else {
      /* Oh joy.  gen4 doesn't have SIMD8 non-shadow-compare bias/lod
       * instructions.  We'll need to do SIMD16 here.
       */
      assert(ir->op == ir_txb || ir->op == ir_txl);

      for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
	 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i * 2),
		      coordinate));
	 coordinate.reg_offset++;
      }

      /* lod/bias appears after u/v/r. */
      mlen += 6;

      if (ir->op == ir_txb) {
	 ir->lod_info.bias->accept(this);
	 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen),
		      this->result));
	 mlen++;
      } else {
	 ir->lod_info.lod->accept(this);
	 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen),
		      this->result));
	 mlen++;
      }

      /* The unused upper half. */
      mlen++;

      /* Now, since we're doing simd16, the return is 2 interleaved
       * vec4s where the odd-indexed ones are junk. We'll need to move
       * this weirdness around to the expected layout.
       */
      simd16 = true;
      orig_dst = dst;
      dst = fs_reg(this, glsl_type::get_array_instance(glsl_type::vec4_type,
						       2));
      dst.type = BRW_REGISTER_TYPE_F;
   }

   fs_inst *inst = NULL;
   switch (ir->op) {
   case ir_tex:
      inst = emit(fs_inst(FS_OPCODE_TEX, dst));
      break;
   case ir_txb:
      inst = emit(fs_inst(FS_OPCODE_TXB, dst));
      break;
   case ir_txl:
      inst = emit(fs_inst(FS_OPCODE_TXL, dst));
      break;
   case ir_txd:
   case ir_txf:
      assert(!"GLSL 1.30 features unsupported");
      break;
   }
   inst->base_mrf = base_mrf;
   inst->mlen = mlen;

   if (simd16) {
      for (int i = 0; i < 4; i++) {
	 emit(fs_inst(BRW_OPCODE_MOV, orig_dst, dst));
	 orig_dst.reg_offset++;
	 dst.reg_offset += 2;
      }
   }

   return inst;
}

fs_inst *
fs_visitor::emit_texture_gen5(ir_texture *ir, fs_reg dst, fs_reg coordinate)
{
   /* gen5's SIMD8 sampler has slots for u, v, r, array index, then
    * optional parameters like shadow comparitor or LOD bias.  If
    * optional parameters aren't present, those base slots are
    * optional and don't need to be included in the message.
    *
    * We don't fill in the unnecessary slots regardless, which may
    * look surprising in the disassembly.
    */
   int mlen = 1; /* g0 header always present. */
   int base_mrf = 1;

   for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
      emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i),
		   coordinate));
      coordinate.reg_offset++;
   }
   mlen += ir->coordinate->type->vector_elements;

   if (ir->shadow_comparitor) {
      mlen = MAX2(mlen, 5);

      ir->shadow_comparitor->accept(this);
      emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result));
      mlen++;
   }

   fs_inst *inst = NULL;
   switch (ir->op) {
   case ir_tex:
      inst = emit(fs_inst(FS_OPCODE_TEX, dst));
      break;
   case ir_txb:
      ir->lod_info.bias->accept(this);
      mlen = MAX2(mlen, 5);
      emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result));
      mlen++;

      inst = emit(fs_inst(FS_OPCODE_TXB, dst));
      break;
   case ir_txl:
      ir->lod_info.lod->accept(this);
      mlen = MAX2(mlen, 5);
      emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result));
      mlen++;

      inst = emit(fs_inst(FS_OPCODE_TXL, dst));
      break;
   case ir_txd:
   case ir_txf:
      assert(!"GLSL 1.30 features unsupported");
      break;
   }
   inst->base_mrf = base_mrf;
   inst->mlen = mlen;

   return inst;
}

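/**
 * Handles a texture instruction: rectangle texture coordinate scaling,
 * the per-generation message setup, and any texture result swizzling
 * from the key.
 */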
void
fs_visitor::visit(ir_texture *ir)
{
   int sampler;
   fs_inst *inst = NULL;

   ir->coordinate->accept(this);
   fs_reg coordinate = this->result;

   /* Should be lowered by do_lower_texture_projection */
   assert(!ir->projector);

   sampler = _mesa_get_sampler_uniform_value(ir->sampler,
					     ctx->Shader.CurrentFragmentProgram,
					     &brw->fragment_program->Base);
   sampler = c->fp->program.Base.SamplerUnits[sampler];

   /* The 965 requires the EU to do the normalization of GL rectangle
    * texture coordinates.  We use the program parameter state
    * tracking to get the scaling factor.
    */
   if (ir->sampler->type->sampler_dimensionality == GLSL_SAMPLER_DIM_RECT) {
      struct gl_program_parameter_list *params = c->fp->program.Base.Parameters;
      int tokens[STATE_LENGTH] = {
	 STATE_INTERNAL,
	 STATE_TEXRECT_SCALE,
	 sampler,
	 0,
	 0
      };

      c->prog_data.param_convert[c->prog_data.nr_params] =
	 PARAM_NO_CONVERT;
      c->prog_data.param_convert[c->prog_data.nr_params + 1] =
	 PARAM_NO_CONVERT;

      fs_reg scale_x = fs_reg(UNIFORM, c->prog_data.nr_params);
      fs_reg scale_y = fs_reg(UNIFORM, c->prog_data.nr_params + 1);
      GLuint index = _mesa_add_state_reference(params,
					       (gl_state_index *)tokens);
      float *vec_values = this->fp->Base.Parameters->ParameterValues[index];

      c->prog_data.param[c->prog_data.nr_params++] = &vec_values[0];
      c->prog_data.param[c->prog_data.nr_params++] = &vec_values[1];

      fs_reg dst = fs_reg(this, ir->coordinate->type);
      fs_reg src = coordinate;
      coordinate = dst;

      emit(fs_inst(BRW_OPCODE_MUL, dst, src, scale_x));
      dst.reg_offset++;
      src.reg_offset++;
      emit(fs_inst(BRW_OPCODE_MUL, dst, src, scale_y));
   }

   /* Writemasking doesn't eliminate channels on SIMD8 texture
    * samples, so don't worry about them.
    */
   fs_reg dst = fs_reg(this, glsl_type::vec4_type);

   if (intel->gen < 5) {
      inst = emit_texture_gen4(ir, dst, coordinate);
   } else {
      inst = emit_texture_gen5(ir, dst, coordinate);
   }

   inst->sampler = sampler;

   this->result = dst;

   if (ir->shadow_comparitor)
      inst->shadow_compare = true;

   if (c->key.tex_swizzles[inst->sampler] != SWIZZLE_NOOP) {
      fs_reg swizzle_dst = fs_reg(this, glsl_type::vec4_type);

      for (int i = 0; i < 4; i++) {
	 int swiz = GET_SWZ(c->key.tex_swizzles[inst->sampler], i);
	 fs_reg l = swizzle_dst;
	 l.reg_offset += i;

	 if (swiz == SWIZZLE_ZERO) {
	    emit(fs_inst(BRW_OPCODE_MOV, l, fs_reg(0.0f)));
	 } else if (swiz == SWIZZLE_ONE) {
	    emit(fs_inst(BRW_OPCODE_MOV, l, fs_reg(1.0f)));
	 } else {
	    fs_reg r = dst;
	    r.reg_offset += GET_SWZ(c->key.tex_swizzles[inst->sampler], i);
	    emit(fs_inst(BRW_OPCODE_MOV, l, r));
	 }
      }
      this->result = swizzle_dst;
   }
}

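/**
 * Resolves a swizzle, either by just offsetting into the source value
 * for a single-channel result or by emitting a MOV per channel.
 */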
void
fs_visitor::visit(ir_swizzle *ir)
{
   ir->val->accept(this);
   fs_reg val = this->result;

   if (ir->type->vector_elements == 1) {
      this->result.reg_offset += ir->mask.x;
      return;
   }

   fs_reg result = fs_reg(this, ir->type);
   this->result = result;

   for (unsigned int i = 0; i < ir->type->vector_elements; i++) {
      fs_reg channel = val;
      int swiz = 0;

      switch (i) {
      case 0:
	 swiz = ir->mask.x;
	 break;
      case 1:
	 swiz = ir->mask.y;
	 break;
      case 2:
	 swiz = ir->mask.z;
	 break;
      case 3:
	 swiz = ir->mask.w;
	 break;
      }

      channel.reg_offset += swiz;
      emit(fs_inst(BRW_OPCODE_MOV, result, channel));
      result.reg_offset++;
   }
}

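/**
 * Emits the pixel-kill sequence for a discard.  Only unconditional
 * discard is handled so far (note the FINISHME); kill_emitted is
 * recorded so the FB write setup knows to keep its header.
 */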
void
fs_visitor::visit(ir_discard *ir)
{
   fs_reg temp = fs_reg(this, glsl_type::uint_type);

   assert(ir->condition == NULL); /* FINISHME */

   emit(fs_inst(FS_OPCODE_DISCARD_NOT, temp, reg_null_d));
   emit(fs_inst(FS_OPCODE_DISCARD_AND, reg_null_d, temp));
   kill_emitted = true;
}

void
fs_visitor::visit(ir_constant *ir)
{
   fs_reg reg(this, ir->type);
   this->result = reg;

   for (unsigned int i = 0; i < ir->type->vector_elements; i++) {
      switch (ir->type->base_type) {
      case GLSL_TYPE_FLOAT:
	 emit(fs_inst(BRW_OPCODE_MOV, reg, fs_reg(ir->value.f[i])));
	 break;
      case GLSL_TYPE_UINT:
	 emit(fs_inst(BRW_OPCODE_MOV, reg, fs_reg(ir->value.u[i])));
	 break;
      case GLSL_TYPE_INT:
	 emit(fs_inst(BRW_OPCODE_MOV, reg, fs_reg(ir->value.i[i])));
	 break;
      case GLSL_TYPE_BOOL:
	 emit(fs_inst(BRW_OPCODE_MOV, reg, fs_reg((int)ir->value.b[i])));
	 break;
      default:
	 assert(!"Non-float/uint/int/bool constant");
      }
      reg.reg_offset++;
   }
}

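/**
 * Emits instructions to set the flag register from a boolean rvalue,
 * folding comparison expressions directly into the conditional mod
 * where possible.
 */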
void
fs_visitor::emit_bool_to_cond_code(ir_rvalue *ir)
{
   ir_expression *expr = ir->as_expression();

   if (expr) {
      fs_reg op[2];
      fs_inst *inst;

      for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
	 assert(expr->operands[i]->type->is_scalar());

	 expr->operands[i]->accept(this);
	 op[i] = this->result;
      }

      switch (expr->operation) {
      case ir_unop_logic_not:
	 inst = emit(fs_inst(BRW_OPCODE_AND, reg_null_d, op[0], fs_reg(1)));
	 inst->conditional_mod = BRW_CONDITIONAL_Z;
	 break;

      case ir_binop_logic_xor:
	 inst = emit(fs_inst(BRW_OPCODE_XOR, reg_null_d, op[0], op[1]));
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 break;

      case ir_binop_logic_or:
	 inst = emit(fs_inst(BRW_OPCODE_OR, reg_null_d, op[0], op[1]));
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 break;

      case ir_binop_logic_and:
	 inst = emit(fs_inst(BRW_OPCODE_AND, reg_null_d, op[0], op[1]));
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 break;

      case ir_unop_f2b:
	 if (intel->gen >= 6) {
	    inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null_d,
				op[0], fs_reg(0.0f)));
	 } else {
	    inst = emit(fs_inst(BRW_OPCODE_MOV, reg_null_d, op[0]));
	 }
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 break;

      case ir_unop_i2b:
	 if (intel->gen >= 6) {
	    inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null_d, op[0], fs_reg(0)));
	 } else {
	    inst = emit(fs_inst(BRW_OPCODE_MOV, reg_null_d, op[0]));
	 }
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 break;

      case ir_binop_greater:
	 inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null_d, op[0], op[1]));
	 inst->conditional_mod = BRW_CONDITIONAL_G;
	 break;
      case ir_binop_gequal:
	 inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null_d, op[0], op[1]));
	 inst->conditional_mod = BRW_CONDITIONAL_GE;
	 break;
      case ir_binop_less:
	 inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null_d, op[0], op[1]));
	 inst->conditional_mod = BRW_CONDITIONAL_L;
	 break;
      case ir_binop_lequal:
	 inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null_d, op[0], op[1]));
	 inst->conditional_mod = BRW_CONDITIONAL_LE;
	 break;
      case ir_binop_equal:
      case ir_binop_all_equal:
	 inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null_d, op[0], op[1]));
	 inst->conditional_mod = BRW_CONDITIONAL_Z;
	 break;
      case ir_binop_nequal:
      case ir_binop_any_nequal:
	 inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null_d, op[0], op[1]));
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 break;
      default:
	 assert(!"not reached");
	 this->fail = true;
	 break;
      }
      return;
   }

   ir->accept(this);

   if (intel->gen >= 6) {
      fs_inst *inst = emit(fs_inst(BRW_OPCODE_AND, reg_null_d,
				   this->result, fs_reg(1)));
      inst->conditional_mod = BRW_CONDITIONAL_NZ;
   } else {
      fs_inst *inst = emit(fs_inst(BRW_OPCODE_MOV, reg_null_d, this->result));
      inst->conditional_mod = BRW_CONDITIONAL_NZ;
   }
}

/**
 * Emit a gen6 IF statement with the comparison folded into the IF
 * instruction.
 */
void
fs_visitor::emit_if_gen6(ir_if *ir)
{
   ir_expression *expr = ir->condition->as_expression();

   if (expr) {
      fs_reg op[2];
      fs_inst *inst;
      fs_reg temp;

      for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
	 assert(expr->operands[i]->type->is_scalar());

	 expr->operands[i]->accept(this);
	 op[i] = this->result;
      }

      switch (expr->operation) {
      case ir_unop_logic_not:
	 inst = emit(fs_inst(BRW_OPCODE_IF, temp, op[0], fs_reg(1)));
	 inst->conditional_mod = BRW_CONDITIONAL_Z;
	 return;

      case ir_binop_logic_xor:
	 inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_d, op[0], op[1]));
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 return;

      case ir_binop_logic_or:
	 temp = fs_reg(this, glsl_type::bool_type);
	 emit(fs_inst(BRW_OPCODE_OR, temp, op[0], op[1]));
	 inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_d, temp, fs_reg(0)));
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 return;

      case ir_binop_logic_and:
	 temp = fs_reg(this, glsl_type::bool_type);
	 emit(fs_inst(BRW_OPCODE_AND, temp, op[0], op[1]));
	 inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_d, temp, fs_reg(0)));
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 return;

      case ir_unop_f2b:
	 inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_f, op[0], fs_reg(0)));
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 return;

      case ir_unop_i2b:
	 inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_d, op[0], fs_reg(0)));
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 return;

      case ir_binop_greater:
	 inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_d, op[0], op[1]));
	 inst->conditional_mod = BRW_CONDITIONAL_G;
	 return;
      case ir_binop_gequal:
	 inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_d, op[0], op[1]));
	 inst->conditional_mod = BRW_CONDITIONAL_GE;
	 return;
      case ir_binop_less:
	 inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_d, op[0], op[1]));
	 inst->conditional_mod = BRW_CONDITIONAL_L;
	 return;
      case ir_binop_lequal:
	 inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_d, op[0], op[1]));
	 inst->conditional_mod = BRW_CONDITIONAL_LE;
	 return;
      case ir_binop_equal:
      case ir_binop_all_equal:
	 inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_d, op[0], op[1]));
	 inst->conditional_mod = BRW_CONDITIONAL_Z;
	 return;
      case ir_binop_nequal:
      case ir_binop_any_nequal:
	 inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_d, op[0], op[1]));
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 return;
      default:
	 assert(!"not reached");
	 inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_d, op[0], fs_reg(0)));
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 this->fail = true;
	 return;
      }
      return;
   }

   ir->condition->accept(this);

   fs_inst *inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_d, this->result, fs_reg(0)));
   inst->conditional_mod = BRW_CONDITIONAL_NZ;
}

void
fs_visitor::visit(ir_if *ir)
{
   fs_inst *inst;

   /* Don't point the annotation at the if statement, because then it plus
    * the then and else blocks get printed.
    */
   this->base_ir = ir->condition;

   if (intel->gen >= 6) {
      emit_if_gen6(ir);
   } else {
      emit_bool_to_cond_code(ir->condition);

      inst = emit(fs_inst(BRW_OPCODE_IF));
      inst->predicated = true;
   }

   foreach_iter(exec_list_iterator, iter, ir->then_instructions) {
      ir_instruction *ir = (ir_instruction *)iter.get();
      this->base_ir = ir;

      ir->accept(this);
   }

   if (!ir->else_instructions.is_empty()) {
      emit(fs_inst(BRW_OPCODE_ELSE));

      foreach_iter(exec_list_iterator, iter, ir->else_instructions) {
	 ir_instruction *ir = (ir_instruction *)iter.get();
	 this->base_ir = ir;

	 ir->accept(this);
      }
   }

   emit(fs_inst(BRW_OPCODE_ENDIF));
}

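/**
 * Emits a DO/WHILE loop, with the counter initialization, the bound
 * test as a predicated BREAK, and the increment taken from the ir_loop
 * fields when present.
 */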
void
fs_visitor::visit(ir_loop *ir)
{
   fs_reg counter = reg_undef;

   if (ir->counter) {
      this->base_ir = ir->counter;
      ir->counter->accept(this);
      counter = *(variable_storage(ir->counter));

      if (ir->from) {
	 this->base_ir = ir->from;
	 ir->from->accept(this);

	 emit(fs_inst(BRW_OPCODE_MOV, counter, this->result));
      }
   }

   emit(fs_inst(BRW_OPCODE_DO));

   if (ir->to) {
      this->base_ir = ir->to;
      ir->to->accept(this);

      fs_inst *inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null_d,
				   counter, this->result));
      switch (ir->cmp) {
      case ir_binop_equal:
	 inst->conditional_mod = BRW_CONDITIONAL_Z;
	 break;
      case ir_binop_nequal:
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 break;
      case ir_binop_gequal:
	 inst->conditional_mod = BRW_CONDITIONAL_GE;
	 break;
      case ir_binop_lequal:
	 inst->conditional_mod = BRW_CONDITIONAL_LE;
	 break;
      case ir_binop_greater:
	 inst->conditional_mod = BRW_CONDITIONAL_G;
	 break;
      case ir_binop_less:
	 inst->conditional_mod = BRW_CONDITIONAL_L;
	 break;
      default:
	 assert(!"not reached: unknown loop condition");
	 this->fail = true;
	 break;
      }

      inst = emit(fs_inst(BRW_OPCODE_BREAK));
      inst->predicated = true;
   }

   foreach_iter(exec_list_iterator, iter, ir->body_instructions) {
      ir_instruction *ir = (ir_instruction *)iter.get();

      this->base_ir = ir;
      ir->accept(this);
   }

   if (ir->increment) {
      this->base_ir = ir->increment;
      ir->increment->accept(this);
      emit(fs_inst(BRW_OPCODE_ADD, counter, counter, this->result));
   }

   emit(fs_inst(BRW_OPCODE_WHILE));
}

void
fs_visitor::visit(ir_loop_jump *ir)
{
   switch (ir->mode) {
   case ir_loop_jump::jump_break:
      emit(fs_inst(BRW_OPCODE_BREAK));
      break;
   case ir_loop_jump::jump_continue:
      emit(fs_inst(BRW_OPCODE_CONTINUE));
      break;
   }
}

void
fs_visitor::visit(ir_call *ir)
{
   assert(!"FINISHME");
}

void
fs_visitor::visit(ir_return *ir)
{
   assert(!"FINISHME");
}

void
fs_visitor::visit(ir_function *ir)
{
   /* Ignore function bodies other than main() -- we shouldn't see calls to
    * them since they should all be inlined before we get to ir_to_mesa.
    */
   if (strcmp(ir->name, "main") == 0) {
      const ir_function_signature *sig;
      exec_list empty;

      sig = ir->matching_signature(&empty);

      assert(sig);

      foreach_iter(exec_list_iterator, iter, sig->body) {
	 ir_instruction *ir = (ir_instruction *)iter.get();
	 this->base_ir = ir;

	 ir->accept(this);
      }
   }
}

void
fs_visitor::visit(ir_function_signature *ir)
{
   assert(!"not reached");
   (void)ir;
}

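/**
 * Copies the given instruction into talloc'd storage, tags it with the
 * current annotation and IR, and appends it to the instruction list.
 */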
1741fs_inst *
1742fs_visitor::emit(fs_inst inst)
1743{
1744   fs_inst *list_inst = new(mem_ctx) fs_inst;
1745   *list_inst = inst;
1746
1747   list_inst->annotation = this->current_annotation;
1748   list_inst->ir = this->base_ir;
1749
1750   this->instructions.push_tail(list_inst);
1751
1752   return list_inst;
1753}
1754
1755/** Emits a dummy fragment shader consisting of magenta for bringup purposes. */
1756void
1757fs_visitor::emit_dummy_fs()
1758{
1759   /* Everyone's favorite color. */
1760   emit(fs_inst(BRW_OPCODE_MOV,
1761		fs_reg(MRF, 2),
1762		fs_reg(1.0f)));
1763   emit(fs_inst(BRW_OPCODE_MOV,
1764		fs_reg(MRF, 3),
1765		fs_reg(0.0f)));
1766   emit(fs_inst(BRW_OPCODE_MOV,
1767		fs_reg(MRF, 4),
1768		fs_reg(1.0f)));
1769   emit(fs_inst(BRW_OPCODE_MOV,
1770		fs_reg(MRF, 5),
1771		fs_reg(0.0f)));
1772
1773   fs_inst *write;
1774   write = emit(fs_inst(FS_OPCODE_FB_WRITE,
1775			fs_reg(0),
1776			fs_reg(0)));
1777   write->base_mrf = 0;
1778}
1779
1780/* The register location here is relative to the start of the URB
1781 * data.  It will get adjusted to be a real location before
1782 * generate_code() time.
1783 */
1784struct brw_reg
1785fs_visitor::interp_reg(int location, int channel)
1786{
1787   int regnr = urb_setup[location] * 2 + channel / 2;
1788   int stride = (channel & 1) * 4;
1789
1790   assert(urb_setup[location] != -1);
1791
1792   return brw_vec1_grf(regnr, stride);
1793}
1794
1795/** Emits the interpolation for the varying inputs. */
1796void
1797fs_visitor::emit_interpolation_setup_gen4()
1798{
1799   struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
1800
1801   this->current_annotation = "compute pixel centers";
1802   this->pixel_x = fs_reg(this, glsl_type::uint_type);
1803   this->pixel_y = fs_reg(this, glsl_type::uint_type);
1804   this->pixel_x.type = BRW_REGISTER_TYPE_UW;
1805   this->pixel_y.type = BRW_REGISTER_TYPE_UW;
1806   emit(fs_inst(BRW_OPCODE_ADD,
1807		this->pixel_x,
1808		fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
1809		fs_reg(brw_imm_v(0x10101010))));
1810   emit(fs_inst(BRW_OPCODE_ADD,
1811		this->pixel_y,
1812		fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
1813		fs_reg(brw_imm_v(0x11001100))));
1814
1815   this->current_annotation = "compute pixel deltas from v0";
1816   if (brw->has_pln) {
1817      this->delta_x = fs_reg(this, glsl_type::vec2_type);
1818      this->delta_y = this->delta_x;
1819      this->delta_y.reg_offset++;
1820   } else {
1821      this->delta_x = fs_reg(this, glsl_type::float_type);
1822      this->delta_y = fs_reg(this, glsl_type::float_type);
1823   }
1824   emit(fs_inst(BRW_OPCODE_ADD,
1825		this->delta_x,
1826		this->pixel_x,
1827		fs_reg(negate(brw_vec1_grf(1, 0)))));
1828   emit(fs_inst(BRW_OPCODE_ADD,
1829		this->delta_y,
1830		this->pixel_y,
1831		fs_reg(negate(brw_vec1_grf(1, 1)))));
1832
1833   this->current_annotation = "compute pos.w and 1/pos.w";
1834   /* Compute wpos.w.  It's always in our setup, since it's needed to
1835    * interpolate the other attributes.
1836    */
1837   this->wpos_w = fs_reg(this, glsl_type::float_type);
1838   emit(fs_inst(FS_OPCODE_LINTERP, wpos_w, this->delta_x, this->delta_y,
1839		interp_reg(FRAG_ATTRIB_WPOS, 3)));
1840   /* Compute the pixel 1/W value from wpos.w. */
1841   this->pixel_w = fs_reg(this, glsl_type::float_type);
1842   emit_math(FS_OPCODE_RCP, this->pixel_w, wpos_w);
1843   this->current_annotation = NULL;
1844}
1845
1846/** Emits the interpolation for the varying inputs. */
1847void
1848fs_visitor::emit_interpolation_setup_gen6()
1849{
1850   struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
1851
1852   /* If the pixel centers end up used, the setup is the same as for gen4. */
1853   this->current_annotation = "compute pixel centers";
1854   fs_reg int_pixel_x = fs_reg(this, glsl_type::uint_type);
1855   fs_reg int_pixel_y = fs_reg(this, glsl_type::uint_type);
1856   int_pixel_x.type = BRW_REGISTER_TYPE_UW;
1857   int_pixel_y.type = BRW_REGISTER_TYPE_UW;
1858   emit(fs_inst(BRW_OPCODE_ADD,
1859		int_pixel_x,
1860		fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
1861		fs_reg(brw_imm_v(0x10101010))));
1862   emit(fs_inst(BRW_OPCODE_ADD,
1863		int_pixel_y,
1864		fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
1865		fs_reg(brw_imm_v(0x11001100))));
1866
1867   /* As of gen6, we can no longer mix float and int sources.  We have
1868    * to turn the integer pixel centers into floats for their actual
1869    * use.
1870    */
1871   this->pixel_x = fs_reg(this, glsl_type::float_type);
1872   this->pixel_y = fs_reg(this, glsl_type::float_type);
1873   emit(fs_inst(BRW_OPCODE_MOV, this->pixel_x, int_pixel_x));
1874   emit(fs_inst(BRW_OPCODE_MOV, this->pixel_y, int_pixel_y));
1875
1876   this->current_annotation = "compute 1/pos.w";
1877   this->wpos_w = fs_reg(brw_vec8_grf(c->key.source_w_reg, 0));
1878   this->pixel_w = fs_reg(this, glsl_type::float_type);
1879   emit_math(FS_OPCODE_RCP, this->pixel_w, wpos_w);
1880
1881   this->delta_x = fs_reg(brw_vec8_grf(2, 0));
1882   this->delta_y = fs_reg(brw_vec8_grf(3, 0));
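   /* On gen6 the per-pixel X/Y deltas used for interpolation arrive
    * precomputed in the thread payload (g2/g3 here), rather than being
    * derived from g1 as in the gen4 path above.
    */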
1883
1884   this->current_annotation = NULL;
1885}
1886
1887void
1888fs_visitor::emit_fb_writes()
1889{
1890   this->current_annotation = "FB write header";
1891   GLboolean header_present = GL_TRUE;
1892   bool header_present = true;
1893
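   /* The FB write payload is assembled in MRF order: (m0..m1 header, when
    * present) (AA dest stencil) (4 regs of color per target) (source depth)
    * (dest depth); nr tracks the running message length.
    */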
1894   if (intel->gen >= 6 &&
1895       !this->kill_emitted &&
1896       c->key.nr_color_regions == 1) {
1897      header_present = false;
1898   }
1899
1900   if (header_present) {
1901      /* m0, m1 header */
1902      nr += 2;
1903   }
1904
1905   if (c->key.aa_dest_stencil_reg) {
1906      emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
1907		   fs_reg(brw_vec8_grf(c->key.aa_dest_stencil_reg, 0))));
1908   }
1909
1910   /* Reserve space for color. It'll be filled in per MRT below. */
1911   int color_mrf = nr;
1912   nr += 4;
1913
1914   if (c->key.source_depth_to_render_target) {
1915      if (c->key.computes_depth) {
1916	 /* Hand over gl_FragDepth. */
1917	 assert(this->frag_depth);
1918	 fs_reg depth = *(variable_storage(this->frag_depth));
1919
1920	 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, nr++), depth));
1921      } else {
1922	 /* Pass through the payload depth. */
1923	 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
1924		      fs_reg(brw_vec8_grf(c->key.source_depth_reg, 0))));
1925      }
1926   }
1927
1928   if (c->key.dest_depth_reg) {
1929      emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
1930		   fs_reg(brw_vec8_grf(c->key.dest_depth_reg, 0))));
1931   }
1932
1933   fs_reg color = reg_undef;
1934   if (this->frag_color)
1935      color = *(variable_storage(this->frag_color));
1936   else if (this->frag_data)
1937      color = *(variable_storage(this->frag_data));
1938
1939   for (int target = 0; target < c->key.nr_color_regions; target++) {
1940      this->current_annotation = talloc_asprintf(this->mem_ctx,
1941						 "FB write target %d",
1942						 target);
1943      if (this->frag_color || this->frag_data) {
1944	 for (int i = 0; i < 4; i++) {
1945	    emit(fs_inst(BRW_OPCODE_MOV,
1946			 fs_reg(MRF, color_mrf + i),
1947			 color));
1948	    color.reg_offset++;
1949	 }
1950      }
1951
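      /* gl_FragColor broadcasts one value to all render targets, so rewind
       * to reuse it; gl_FragData[] keeps advancing to the next element.
       */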
1952      if (this->frag_color)
1953	 color.reg_offset -= 4;
1954
1955      fs_inst *inst = emit(fs_inst(FS_OPCODE_FB_WRITE,
1956				   reg_undef, reg_undef));
1957      inst->target = target;
1958      inst->base_mrf = 0;
1959      inst->mlen = nr;
1960      if (target == c->key.nr_color_regions - 1)
1961	 inst->eot = true;
1962      inst->header_present = header_present;
1963   }
1964
1965   if (c->key.nr_color_regions == 0) {
1966      fs_inst *inst = emit(fs_inst(FS_OPCODE_FB_WRITE,
1967				   reg_undef, reg_undef));
1968      inst->base_mrf = 0;
1969      inst->mlen = nr;
1970      inst->eot = true;
1971      inst->header_present = header_present;
1972   }
1973
1974   this->current_annotation = NULL;
1975}
1976
1977void
1978fs_visitor::generate_fb_write(fs_inst *inst)
1979{
1980   GLboolean eot = inst->eot;
1981   struct brw_reg implied_header;
1982
1983   /* Header is 2 regs, g0 and g1 are the contents. g0 will be implied
1984    * move, here's g1.
1985    */
1986   brw_push_insn_state(p);
1987   brw_set_mask_control(p, BRW_MASK_DISABLE);
1988   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1989
1990   if (inst->header_present) {
1991      if (intel->gen >= 6) {
1992	 brw_MOV(p,
1993		 brw_message_reg(inst->base_mrf),
1994		 brw_vec8_grf(0, 0));
1995
1996	 if (inst->target > 0) {
1997	    /* Set the render target index for choosing BLEND_STATE. */
1998	    brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, 0, 2),
1999			      BRW_REGISTER_TYPE_UD),
2000		    brw_imm_ud(inst->target));
2001	 }
2002
2003	 /* Clear viewport index, render target array index. */
2004	 brw_AND(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, 0, 0),
2005			   BRW_REGISTER_TYPE_UD),
2006		 retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD),
2007		 brw_imm_ud(0xf7ff));
2008
2009	 implied_header = brw_null_reg();
2010      } else {
2011	 implied_header = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
2012      }
2013
2014      brw_MOV(p,
2015	      brw_message_reg(inst->base_mrf + 1),
2016	      brw_vec8_grf(1, 0));
2017   } else {
2018      implied_header = brw_null_reg();
2019   }
2020
2021   brw_pop_insn_state(p);
2022
2023   brw_fb_WRITE(p,
2024		8, /* dispatch_width */
2025		retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW),
2026		inst->base_mrf,
2027		implied_header,
2028		inst->target,
2029		inst->mlen,
2030		0,
2031		eot);
2032}
2033
2034void
2035fs_visitor::generate_linterp(fs_inst *inst,
2036			     struct brw_reg dst, struct brw_reg *src)
2037{
2038   struct brw_reg delta_x = src[0];
2039   struct brw_reg delta_y = src[1];
2040   struct brw_reg interp = src[2];
2041
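   /* PLN consumes delta_x/delta_y as one register pair, so it's only
    * usable when they're adjacent and (before gen6) the pair starts on an
    * even register number; otherwise fall back to LINE+MAC.
    */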
2042   if (brw->has_pln &&
2043       delta_y.nr == delta_x.nr + 1 &&
2044       (intel->gen >= 6 || (delta_x.nr & 1) == 0)) {
2045      brw_PLN(p, dst, interp, delta_x);
2046   } else {
2047      brw_LINE(p, brw_null_reg(), interp, delta_x);
2048      brw_MAC(p, dst, suboffset(interp, 1), delta_y);
2049   }
2050}
2051
2052void
2053fs_visitor::generate_math(fs_inst *inst,
2054			  struct brw_reg dst, struct brw_reg *src)
2055{
2056   int op;
2057
2058   switch (inst->opcode) {
2059   case FS_OPCODE_RCP:
2060      op = BRW_MATH_FUNCTION_INV;
2061      break;
2062   case FS_OPCODE_RSQ:
2063      op = BRW_MATH_FUNCTION_RSQ;
2064      break;
2065   case FS_OPCODE_SQRT:
2066      op = BRW_MATH_FUNCTION_SQRT;
2067      break;
2068   case FS_OPCODE_EXP2:
2069      op = BRW_MATH_FUNCTION_EXP;
2070      break;
2071   case FS_OPCODE_LOG2:
2072      op = BRW_MATH_FUNCTION_LOG;
2073      break;
2074   case FS_OPCODE_POW:
2075      op = BRW_MATH_FUNCTION_POW;
2076      break;
2077   case FS_OPCODE_SIN:
2078      op = BRW_MATH_FUNCTION_SIN;
2079      break;
2080   case FS_OPCODE_COS:
2081      op = BRW_MATH_FUNCTION_COS;
2082      break;
2083   default:
2084      assert(!"not reached: unknown math function");
2085      op = 0;
2086      break;
2087   }
2088
2089   if (intel->gen >= 6) {
2090      assert(inst->mlen == 0);
2091
2092      if (inst->opcode == FS_OPCODE_POW) {
2093	 brw_math2(p, dst, op, src[0], src[1]);
2094      } else {
2095	 brw_math(p, dst,
2096		  op,
2097		  inst->saturate ? BRW_MATH_SATURATE_SATURATE :
2098		  BRW_MATH_SATURATE_NONE,
2099		  0, src[0],
2100		  BRW_MATH_DATA_VECTOR,
2101		  BRW_MATH_PRECISION_FULL);
2102      }
2103   } else {
2104      assert(inst->mlen >= 1);
2105
2106      brw_math(p, dst,
2107	       op,
2108	       inst->saturate ? BRW_MATH_SATURATE_SATURATE :
2109	       BRW_MATH_SATURATE_NONE,
2110	       inst->base_mrf, src[0],
2111	       BRW_MATH_DATA_VECTOR,
2112	       BRW_MATH_PRECISION_FULL);
2113   }
2114}
2115
2116void
2117fs_visitor::generate_tex(fs_inst *inst, struct brw_reg dst)
2118{
2119   int msg_type = -1;
2120   int rlen = 4;
2121   uint32_t simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
2122
2123   if (intel->gen >= 5) {
2124      switch (inst->opcode) {
2125      case FS_OPCODE_TEX:
2126	 if (inst->shadow_compare) {
2127	    msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_COMPARE_GEN5;
2128	 } else {
2129	    msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_GEN5;
2130	 }
2131	 break;
2132      case FS_OPCODE_TXB:
2133	 if (inst->shadow_compare) {
2134	    msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE_GEN5;
2135	 } else {
2136	    msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_BIAS_GEN5;
2137	 }
2138	 break;
2139      }
2140   } else {
2141      switch (inst->opcode) {
2142      case FS_OPCODE_TEX:
2143	 /* Note that G45 and older determine shadow compare and dispatch width
2144	  * from the message length for most messages.
2145	  */
2146	 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
2147	 if (inst->shadow_compare) {
2148	    assert(inst->mlen == 6);
2149	 } else {
2150	    assert(inst->mlen <= 4);
2151	 }
2152	 break;
2153      case FS_OPCODE_TXB:
2154	 if (inst->shadow_compare) {
2155	    assert(inst->mlen == 6);
2156	    msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
2157	 } else {
2158	    assert(inst->mlen == 9);
2159	    msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
2160	    simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
2161	 }
2162	 break;
2163      }
2164   }
2165   assert(msg_type != -1);
2166
2167   if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) {
2168      rlen = 8;
2169      dst = vec16(dst);
2170   }
2171
2172   brw_SAMPLE(p,
2173	      retype(dst, BRW_REGISTER_TYPE_UW),
2174	      inst->base_mrf,
2175	      retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
2176              SURF_INDEX_TEXTURE(inst->sampler),
2177	      inst->sampler,
2178	      WRITEMASK_XYZW,
2179	      msg_type,
2180	      rlen,
2181	      inst->mlen,
2182	      0,
2183	      1,
2184	      simd_mode);
2185}
2186
2187
2188/* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
2189 * looking like:
2190 *
2191 * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
2192 *
2193 * and we're trying to produce:
2194 *
2195 *           DDX                     DDY
2196 * dst: (ss0.tr - ss0.tl)     (ss0.tl - ss0.bl)
2197 *      (ss0.tr - ss0.tl)     (ss0.tr - ss0.br)
2198 *      (ss0.br - ss0.bl)     (ss0.tl - ss0.bl)
2199 *      (ss0.br - ss0.bl)     (ss0.tr - ss0.br)
2200 *      (ss1.tr - ss1.tl)     (ss1.tl - ss1.bl)
2201 *      (ss1.tr - ss1.tl)     (ss1.tr - ss1.br)
2202 *      (ss1.br - ss1.bl)     (ss1.tl - ss1.bl)
2203 *      (ss1.br - ss1.bl)     (ss1.tr - ss1.br)
2204 *
2205 * and add another set of two more subspans if in 16-pixel dispatch mode.
2206 *
2207 * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
2208 * for each pair, and vertstride = 2 jumps us 2 elements after processing a
2209 * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
2210 * between each other.  We could probably do it like ddx and swizzle the right
2211 * order later, but bail for now and just produce
2212 * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
2213 */
2214void
2215fs_visitor::generate_ddx(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
2216{
2217   struct brw_reg src0 = brw_reg(src.file, src.nr, 1,
2218				 BRW_REGISTER_TYPE_F,
2219				 BRW_VERTICAL_STRIDE_2,
2220				 BRW_WIDTH_2,
2221				 BRW_HORIZONTAL_STRIDE_0,
2222				 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
2223   struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
2224				 BRW_REGISTER_TYPE_F,
2225				 BRW_VERTICAL_STRIDE_2,
2226				 BRW_WIDTH_2,
2227				 BRW_HORIZONTAL_STRIDE_0,
2228				 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
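   /* With vstride=2, width=2, hstride=0, src0 (suboffset 1) reads
    * ss0.tr ss0.tr ss0.br ss0.br ss1.tr ... and src1 (suboffset 0) reads
    * ss0.tl ss0.tl ss0.bl ss0.bl ss1.tl ..., giving the per-pixel deltas
    * of the table above.
    */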
2229   brw_ADD(p, dst, src0, negate(src1));
2230}
2231
2232void
2233fs_visitor::generate_ddy(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
2234{
2235   struct brw_reg src0 = brw_reg(src.file, src.nr, 0,
2236				 BRW_REGISTER_TYPE_F,
2237				 BRW_VERTICAL_STRIDE_4,
2238				 BRW_WIDTH_4,
2239				 BRW_HORIZONTAL_STRIDE_0,
2240				 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
2241   struct brw_reg src1 = brw_reg(src.file, src.nr, 2,
2242				 BRW_REGISTER_TYPE_F,
2243				 BRW_VERTICAL_STRIDE_4,
2244				 BRW_WIDTH_4,
2245				 BRW_HORIZONTAL_STRIDE_0,
2246				 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
2247   brw_ADD(p, dst, src0, negate(src1));
2248}
2249
2250void
2251fs_visitor::generate_discard_not(fs_inst *inst, struct brw_reg mask)
2252{
2253   if (intel->gen >= 6) {
2254      /* Gen6 no longer has the mask reg for us to just read the
2255       * active channels from.  However, cmp updates just the channels
2256       * of the flag reg that are enabled, so we can get at the
2257       * channel enables that way.  In this step, make a reg of ones
2258       * we'll compare to.
2259       */
2260      brw_MOV(p, mask, brw_imm_ud(1));
2261   } else {
2262      brw_push_insn_state(p);
2263      brw_set_mask_control(p, BRW_MASK_DISABLE);
2264      brw_NOT(p, mask, brw_mask_reg(1)); /* IMASK */
2265      brw_pop_insn_state(p);
2266   }
2267}
2268
2269void
2270fs_visitor::generate_discard_and(fs_inst *inst, struct brw_reg mask)
2271{
2272   if (intel->gen >= 6) {
2273      struct brw_reg f0 = brw_flag_reg();
2274      struct brw_reg g1 = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
2275
2276      brw_push_insn_state(p);
2277      brw_set_mask_control(p, BRW_MASK_DISABLE);
2278      brw_MOV(p, f0, brw_imm_uw(0xffff)); /* inactive channels undiscarded */
2279      brw_pop_insn_state(p);
2280
2281      brw_CMP(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UD),
2282	      BRW_CONDITIONAL_Z, mask, brw_imm_ud(0)); /* active channels fail test */
2283      /* Undo CMP's whacking of predication. */
2284      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2285
2286      brw_push_insn_state(p);
2287      brw_set_mask_control(p, BRW_MASK_DISABLE);
2288      brw_AND(p, g1, f0, g1);
2289      brw_pop_insn_state(p);
2290   } else {
2291      struct brw_reg g0 = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
2292
2293      mask = brw_uw1_reg(mask.file, mask.nr, 0);
2294
2295      brw_push_insn_state(p);
2296      brw_set_mask_control(p, BRW_MASK_DISABLE);
2297      brw_AND(p, g0, mask, g0);
2298      brw_pop_insn_state(p);
2299   }
2300}
2301
2302void
2303fs_visitor::generate_spill(fs_inst *inst, struct brw_reg src)
2304{
2305   assert(inst->mlen != 0);
2306
2307   brw_MOV(p,
2308	   retype(brw_message_reg(inst->base_mrf + 1), BRW_REGISTER_TYPE_UD),
2309	   retype(src, BRW_REGISTER_TYPE_UD));
2310   brw_oword_block_write_scratch(p, brw_message_reg(inst->base_mrf), 1,
2311				 inst->offset);
2312}
2313
2314void
2315fs_visitor::generate_unspill(fs_inst *inst, struct brw_reg dst)
2316{
2317   assert(inst->mlen != 0);
2318
2319   /* Clear any post destination dependencies that would be ignored by
2320    * the block read.  See the B-Spec for pre-gen5 send instruction.
2321    *
2322    * This could use a better solution, since texture sampling and
2323    * math reads could potentially run into it as well -- anywhere
2324    * that we have a SEND with a destination that is a register that
2325    * was written but not read within the last N instructions (what's
2326    * N?  unsure).  This is rare because of dead code elimination, but
2327    * not impossible.
2328    */
2329   if (intel->gen == 4 && !intel->is_g4x)
2330      brw_MOV(p, brw_null_reg(), dst);
2331
2332   brw_oword_block_read_scratch(p, dst, brw_message_reg(inst->base_mrf), 1,
2333				inst->offset);
2334
2335   if (intel->gen == 4 && !intel->is_g4x) {
2336      /* gen4 errata: destination from a send can't be used as a
2337       * destination until it's been read.  Just read it so we don't
2338       * have to worry.
2339       */
2340      brw_MOV(p, brw_null_reg(), dst);
2341   }
2342}
2343
2344
2345void
2346fs_visitor::generate_pull_constant_load(fs_inst *inst, struct brw_reg dst)
2347{
2348   assert(inst->mlen != 0);
2349
2350   /* Clear any post destination dependencies that would be ignored by
2351    * the block read.  See the B-Spec for pre-gen5 send instruction.
2352    *
2353    * This could use a better solution, since texture sampling and
2354    * math reads could potentially run into it as well -- anywhere
2355    * that we have a SEND with a destination that is a register that
2356    * was written but not read within the last N instructions (what's
2357    * N?  unsure).  This is rare because of dead code elimination, but
2358    * not impossible.
2359    */
2360   if (intel->gen == 4 && !intel->is_g4x)
2361      brw_MOV(p, brw_null_reg(), dst);
2362
2363   brw_oword_block_read(p, dst, brw_message_reg(inst->base_mrf),
2364			inst->offset, SURF_INDEX_FRAG_CONST_BUFFER);
2365
2366   if (intel->gen == 4 && !intel->is_g4x) {
2367      /* gen4 errata: destination from a send can't be used as a
2368       * destination until it's been read.  Just read it so we don't
2369       * have to worry.
2370       */
2371      brw_MOV(p, brw_null_reg(), dst);
2372   }
2373}
2374
2375void
2376fs_visitor::assign_curb_setup()
2377{
2378   c->prog_data.first_curbe_grf = c->key.nr_payload_regs;
2379   c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
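   /* Push constants are packed eight floats to a GRF: e.g. uniform slot 11
    * ends up at brw_vec1_grf(first_curbe_grf + 1, 3) via the math below.
    */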
2380
2381   /* Map the offsets in the UNIFORM file to fixed HW regs. */
2382   foreach_iter(exec_list_iterator, iter, this->instructions) {
2383      fs_inst *inst = (fs_inst *)iter.get();
2384
2385      for (unsigned int i = 0; i < 3; i++) {
2386	 if (inst->src[i].file == UNIFORM) {
2387	    int constant_nr = inst->src[i].hw_reg + inst->src[i].reg_offset;
2388	    struct brw_reg brw_reg = brw_vec1_grf(c->prog_data.first_curbe_grf +
2389						  constant_nr / 8,
2390						  constant_nr % 8);
2391
2392	    inst->src[i].file = FIXED_HW_REG;
2393	    inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
2394	 }
2395      }
2396   }
2397}
2398
2399void
2400fs_visitor::calculate_urb_setup()
2401{
2402   for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
2403      urb_setup[i] = -1;
2404   }
2405
2406   int urb_next = 0;
2407   /* Figure out where each of the incoming setup attributes lands. */
2408   if (intel->gen >= 6) {
2409      for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
2410	 if (brw->fragment_program->Base.InputsRead & BITFIELD64_BIT(i)) {
2411	    urb_setup[i] = urb_next++;
2412	 }
2413      }
2414   } else {
2415      /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
2416      for (unsigned int i = 0; i < VERT_RESULT_MAX; i++) {
2417	 if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
2418	    int fp_index;
2419
2420	    if (i >= VERT_RESULT_VAR0)
2421	       fp_index = i - (VERT_RESULT_VAR0 - FRAG_ATTRIB_VAR0);
2422	    else if (i <= VERT_RESULT_TEX7)
2423	       fp_index = i;
2424	    else
2425	       fp_index = -1;
2426
2427	    if (fp_index >= 0)
2428	       urb_setup[fp_index] = urb_next++;
2429	 }
2430      }
2431   }
2432
2433   /* Each attribute is 4 setup channels, each of which is half a reg. */
2434   c->prog_data.urb_read_length = urb_next * 2;
2435}
2436
2437void
2438fs_visitor::assign_urb_setup()
2439{
2440   int urb_start = c->prog_data.first_curbe_grf + c->prog_data.curb_read_length;
2441
2442   /* Offset all the urb_setup[] index by the actual position of the
2443    * setup regs, now that the location of the constants has been chosen.
2444    */
2445   foreach_iter(exec_list_iterator, iter, this->instructions) {
2446      fs_inst *inst = (fs_inst *)iter.get();
2447
2448      if (inst->opcode != FS_OPCODE_LINTERP)
2449	 continue;
2450
2451      assert(inst->src[2].file == FIXED_HW_REG);
2452
2453      inst->src[2].fixed_hw_reg.nr += urb_start;
2454   }
2455
2456   this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
2457}
2458
2459/**
2460 * Split large virtual GRFs into separate components if we can.
2461 *
2462 * This is mostly duplicated with what brw_fs_vector_splitting does,
2463 * but that's really conservative because it's afraid of doing
2464 * splitting that doesn't result in real progress after the rest of
2465 * the optimization phases, which would cause infinite looping in
2466 * optimization.  We can do it once here, safely.  This also has the
2467 * opportunity to split interpolated values, or maybe even uniforms,
2468 * which we don't have at the IR level.
2469 *
2470 * We want to split, because virtual GRFs are what we register
2471 * allocate and spill (due to contiguousness requirements for some
2472 * instructions), and they're what we naturally generate in the
2473 * codegen process, but most virtual GRFs don't actually need to be
2474 * contiguous sets of GRFs.  If we split, we'll end up with reduced
2475 * live intervals and better dead code elimination and coalescing.
2476 */
2477void
2478fs_visitor::split_virtual_grfs()
2479{
2480   int num_vars = this->virtual_grf_next;
2481   bool split_grf[num_vars];
2482   int new_virtual_grf[num_vars];
2483
2484   /* Try to split anything larger than one register. */
2485   for (int i = 0; i < num_vars; i++) {
2486      if (this->virtual_grf_sizes[i] != 1)
2487	 split_grf[i] = true;
2488      else
2489	 split_grf[i] = false;
2490   }
2491
2492   if (brw->has_pln) {
2493      /* PLN opcodes rely on the delta_xy being contiguous. */
2494      split_grf[this->delta_x.reg] = false;
2495   }
2496
2497   foreach_iter(exec_list_iterator, iter, this->instructions) {
2498      fs_inst *inst = (fs_inst *)iter.get();
2499
2500      /* Texturing produces 4 contiguous registers, so no splitting. */
2501      if ((inst->opcode == FS_OPCODE_TEX ||
2502	   inst->opcode == FS_OPCODE_TXB ||
2503	   inst->opcode == FS_OPCODE_TXL) &&
2504	  inst->dst.file == GRF) {
2505	 split_grf[inst->dst.reg] = false;
2506      }
2507   }
2508
2509   /* Allocate new space for split regs.  Note that the virtual
2510    * numbers will be contiguous.
2511    */
2512   for (int i = 0; i < num_vars; i++) {
2513      if (split_grf[i]) {
2514	 new_virtual_grf[i] = virtual_grf_alloc(1);
2515	 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
2516	    int reg = virtual_grf_alloc(1);
2517	    assert(reg == new_virtual_grf[i] + j - 1);
2518	    (void) reg;
2519	 }
2520	 this->virtual_grf_sizes[i] = 1;
2521      }
2522   }
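   /* e.g. a size-3 virtual GRF n becomes n (old reg_offset 0) plus two new
    * size-1 GRFs; the rewrite below maps (reg, offset) to
    * (new_virtual_grf[reg] + offset - 1, 0) for nonzero offsets.
    */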
2523
2524   foreach_iter(exec_list_iterator, iter, this->instructions) {
2525      fs_inst *inst = (fs_inst *)iter.get();
2526
2527      if (inst->dst.file == GRF &&
2528	  split_grf[inst->dst.reg] &&
2529	  inst->dst.reg_offset != 0) {
2530	 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
2531			  inst->dst.reg_offset - 1);
2532	 inst->dst.reg_offset = 0;
2533      }
2534      for (int i = 0; i < 3; i++) {
2535	 if (inst->src[i].file == GRF &&
2536	     split_grf[inst->src[i].reg] &&
2537	     inst->src[i].reg_offset != 0) {
2538	    inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
2539				inst->src[i].reg_offset - 1);
2540	    inst->src[i].reg_offset = 0;
2541	 }
2542      }
2543   }
2544}
2545
2546/**
2547 * Choose accesses from the UNIFORM file to demote to using the pull
2548 * constant buffer.
2549 *
2550 * We allow a fragment shader to use more than the GL-required minimum
2551 * maximum number of fragment shader uniform components (64).  If
2552 * there are too many of these, they'd fill up all of the register space.
2553 * So, this will push some of them out to the pull constant buffer and
2554 * update the program to load them.
2555 */
2556void
2557fs_visitor::setup_pull_constants()
2558{
2559   /* Only allow 16 registers (128 uniform components) as push constants. */
2560   unsigned int max_uniform_components = 16 * 8;
2561   if (c->prog_data.nr_params <= max_uniform_components)
2562      return;
2563
2564   /* Just demote the end of the list.  We could probably do better
2565    * here, demoting things that are rarely used in the program first.
2566    */
2567   int pull_uniform_base = max_uniform_components;
2568   int pull_uniform_count = c->prog_data.nr_params - pull_uniform_base;
2569
2570   foreach_iter(exec_list_iterator, iter, this->instructions) {
2571      fs_inst *inst = (fs_inst *)iter.get();
2572
2573      for (int i = 0; i < 3; i++) {
2574	 if (inst->src[i].file != UNIFORM)
2575	    continue;
2576
2577	 int uniform_nr = inst->src[i].hw_reg + inst->src[i].reg_offset;
2578	 if (uniform_nr < pull_uniform_base)
2579	    continue;
2580
2581	 fs_reg dst = fs_reg(this, glsl_type::float_type);
2582	 fs_inst *pull = new(mem_ctx) fs_inst(FS_OPCODE_PULL_CONSTANT_LOAD,
2583					      dst);
2584	 pull->offset = ((uniform_nr - pull_uniform_base) * 4) & ~15;
2585	 pull->ir = inst->ir;
2586	 pull->annotation = inst->annotation;
2587	 pull->base_mrf = 14;
2588	 pull->mlen = 1;
2589
2590	 inst->insert_before(pull);
2591
2592	 inst->src[i].file = GRF;
2593	 inst->src[i].reg = dst.reg;
2594	 inst->src[i].reg_offset = 0;
2595	 inst->src[i].smear = (uniform_nr - pull_uniform_base) & 3;
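	 /* Worked example (hypothetical numbers): uniform_nr == 133 with
	  * pull_uniform_base == 128 loads the oword at byte offset
	  * (5 * 4) & ~15 == 16, and smear == 5 & 3 == 1 selects float
	  * component 1 of it.
	  */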
2596      }
2597   }
2598
2599   for (int i = 0; i < pull_uniform_count; i++) {
2600      c->prog_data.pull_param[i] = c->prog_data.param[pull_uniform_base + i];
2601      c->prog_data.pull_param_convert[i] =
2602	 c->prog_data.param_convert[pull_uniform_base + i];
2603   }
2604   c->prog_data.nr_params -= pull_uniform_count;
2605   c->prog_data.nr_pull_params = pull_uniform_count;
2606}
2607
2608void
2609fs_visitor::calculate_live_intervals()
2610{
2611   int num_vars = this->virtual_grf_next;
2612   int *def = talloc_array(mem_ctx, int, num_vars);
2613   int *use = talloc_array(mem_ctx, int, num_vars);
2614   int loop_depth = 0;
2615   int loop_start = 0;
2616   int bb_header_ip = 0;
2617
2618   for (int i = 0; i < num_vars; i++) {
2619      def[i] = 1 << 30;
2620      use[i] = -1;
2621   }
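   /* def[] starts "infinitely late" and use[] "never" so the updates below
    * can only tighten them; a reg whose use never passes its def is exactly
    * what dead_code_eliminate() looks for.
    */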
2622
2623   int ip = 0;
2624   foreach_iter(exec_list_iterator, iter, this->instructions) {
2625      fs_inst *inst = (fs_inst *)iter.get();
2626
2627      if (inst->opcode == BRW_OPCODE_DO) {
2628	 if (loop_depth++ == 0)
2629	    loop_start = ip;
2630      } else if (inst->opcode == BRW_OPCODE_WHILE) {
2631	 loop_depth--;
2632
2633	 if (loop_depth == 0) {
2634	    /* Patches up the use of vars marked for being live across
2635	     * the whole loop.
2636	     */
2637	    for (int i = 0; i < num_vars; i++) {
2638	       if (use[i] == loop_start) {
2639		  use[i] = ip;
2640	       }
2641	    }
2642	 }
2643      } else {
2644	 for (unsigned int i = 0; i < 3; i++) {
2645	    if (inst->src[i].file == GRF && inst->src[i].reg != 0) {
2646	       int reg = inst->src[i].reg;
2647
2648	       if (!loop_depth || (this->virtual_grf_sizes[reg] == 1 &&
2649				   def[reg] >= bb_header_ip)) {
2650		  use[reg] = ip;
2651	       } else {
2652		  def[reg] = MIN2(loop_start, def[reg]);
2653		  use[reg] = loop_start;
2654
2655		  /* Nobody else is going to go smash our start to
2656		   * later in the loop now, because def[reg] now
2657		   * points before the bb header.
2658		   */
2659	       }
2660	    }
2661	 }
2662	 if (inst->dst.file == GRF && inst->dst.reg != 0) {
2663	    int reg = inst->dst.reg;
2664
2665	    if (!loop_depth || (this->virtual_grf_sizes[reg] == 1 &&
2666				!inst->predicated)) {
2667	       def[reg] = MIN2(def[reg], ip);
2668	    } else {
2669	       def[reg] = MIN2(def[reg], loop_start);
2670	    }
2671	 }
2672      }
2673
2674      ip++;
2675
2676      /* Set the basic block header IP.  This is used for determining
2677       * if a complete def of single-register virtual GRF in a loop
2678       * dominates a use in the same basic block.  It's a quick way to
2679       * reduce the live interval range of most registers used in a
2680       * loop.
2681       */
2682      if (inst->opcode == BRW_OPCODE_IF ||
2683	  inst->opcode == BRW_OPCODE_ELSE ||
2684	  inst->opcode == BRW_OPCODE_ENDIF ||
2685	  inst->opcode == BRW_OPCODE_DO ||
2686	  inst->opcode == BRW_OPCODE_WHILE ||
2687	  inst->opcode == BRW_OPCODE_BREAK ||
2688	  inst->opcode == BRW_OPCODE_CONTINUE) {
2689	 bb_header_ip = ip;
2690      }
2691   }
2692
2693   talloc_free(this->virtual_grf_def);
2694   talloc_free(this->virtual_grf_use);
2695   this->virtual_grf_def = def;
2696   this->virtual_grf_use = use;
2697}
2698
2699/**
2700 * Attempts to move immediate constants into the immediate
2701 * constant slot of following instructions.
2702 *
2703 * Immediate constants are a bit tricky -- they have to be in the last
2704 * operand slot, and you can't do abs/negate on them.
2705 */
2706
2707bool
2708fs_visitor::propagate_constants()
2709{
2710   bool progress = false;
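   /* e.g. "mov vgrf5, 2.0f; mul vgrf6, vgrf4, vgrf5" becomes
    * "mul vgrf6, vgrf4, 2.0f", since MUL accepts an immediate in src1.
    */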
2711
2712   foreach_iter(exec_list_iterator, iter, this->instructions) {
2713      fs_inst *inst = (fs_inst *)iter.get();
2714
2715      if (inst->opcode != BRW_OPCODE_MOV ||
2716	  inst->predicated ||
2717	  inst->dst.file != GRF || inst->src[0].file != IMM ||
2718	  inst->dst.type != inst->src[0].type)
2719	 continue;
2720
2721      /* Don't bother with cases where we should have had the
2722       * operation on the constant folded in GLSL already.
2723       */
2724      if (inst->saturate)
2725	 continue;
2726
2727      /* Found a move of a constant to a GRF.  Find anything else using the GRF
2728       * before it's written, and replace it with the constant if we can.
2729       */
2730      exec_list_iterator scan_iter = iter;
2731      scan_iter.next();
2732      for (; scan_iter.has_next(); scan_iter.next()) {
2733	 fs_inst *scan_inst = (fs_inst *)scan_iter.get();
2734
2735	 if (scan_inst->opcode == BRW_OPCODE_DO ||
2736	     scan_inst->opcode == BRW_OPCODE_WHILE ||
2737	     scan_inst->opcode == BRW_OPCODE_ELSE ||
2738	     scan_inst->opcode == BRW_OPCODE_ENDIF) {
2739	    break;
2740	 }
2741
2742	 for (int i = 2; i >= 0; i--) {
2743	    if (scan_inst->src[i].file != GRF ||
2744		scan_inst->src[i].reg != inst->dst.reg ||
2745		scan_inst->src[i].reg_offset != inst->dst.reg_offset)
2746	       continue;
2747
2748	    /* Don't bother with cases where we should have had the
2749	     * operation on the constant folded in GLSL already.
2750	     */
2751	    if (scan_inst->src[i].negate || scan_inst->src[i].abs)
2752	       continue;
2753
2754	    switch (scan_inst->opcode) {
2755	    case BRW_OPCODE_MOV:
2756	       scan_inst->src[i] = inst->src[0];
2757	       progress = true;
2758	       break;
2759
2760	    case BRW_OPCODE_MUL:
2761	    case BRW_OPCODE_ADD:
2762	       if (i == 1) {
2763		  scan_inst->src[i] = inst->src[0];
2764		  progress = true;
2765	       } else if (i == 0 && scan_inst->src[1].file != IMM) {
2766		  /* Fit this constant in by commuting the operands. */
2767		  scan_inst->src[0] = scan_inst->src[1];
2768		  scan_inst->src[1] = inst->src[0];
		  progress = true;
2769	       }
2770	       break;
2771	    case BRW_OPCODE_CMP:
2772	       if (i == 1) {
2773		  scan_inst->src[i] = inst->src[0];
2774		  progress = true;
2775	       }
	       break;
2776	    }
2777	 }
2778
2779	 if (scan_inst->dst.file == GRF &&
2780	     scan_inst->dst.reg == inst->dst.reg &&
2781	     (scan_inst->dst.reg_offset == inst->dst.reg_offset ||
2782	      scan_inst->opcode == FS_OPCODE_TEX)) {
2783	    break;
2784	 }
2785      }
2786   }
2787
2788   return progress;
2789}
2790/**
2791 * Must be called after calculate_live_intervals() to remove unused
2792 * writes to registers -- register allocation will fail otherwise
2793 * because something def'd but not used won't be considered to
2794 * interfere with other regs.
2795 */
2796bool
2797fs_visitor::dead_code_eliminate()
2798{
2799   bool progress = false;
2800   int num_vars = this->virtual_grf_next;
2801   bool dead[num_vars];
2802
2803   for (int i = 0; i < num_vars; i++) {
2804      dead[i] = this->virtual_grf_def[i] >= this->virtual_grf_use[i];
2805
2806      if (dead[i]) {
2807	 /* Mark off its interval so it won't interfere with anything. */
2808	 this->virtual_grf_def[i] = -1;
2809	 this->virtual_grf_use[i] = -1;
2810      }
2811   }
2812
2813   foreach_iter(exec_list_iterator, iter, this->instructions) {
2814      fs_inst *inst = (fs_inst *)iter.get();
2815
2816      if (inst->dst.file == GRF && dead[inst->dst.reg]) {
2817	 inst->remove();
2818	 progress = true;
2819      }
2820   }
2821
2822   return progress;
2823}
2824
2825bool
2826fs_visitor::register_coalesce()
2827{
2828   bool progress = false;
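   /* e.g. "mul vgrf3, a, b; mov vgrf7, vgrf3": the MOV is removed and
    * later reads of vgrf7 are rewritten to vgrf3, provided neither is
    * written again in the scanned region.
    */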
2829
2830   foreach_iter(exec_list_iterator, iter, this->instructions) {
2831      fs_inst *inst = (fs_inst *)iter.get();
2832
2833      if (inst->opcode != BRW_OPCODE_MOV ||
2834	  inst->predicated ||
2835	  inst->saturate ||
2836	  inst->dst.file != GRF || inst->src[0].file != GRF ||
2837	  inst->dst.type != inst->src[0].type)
2838	 continue;
2839
2840      /* Found a move of a GRF to a GRF.  Let's see if we can coalesce
2841       * them: check for no writes to either one until the exit of the
2842       * program.
2843       */
2844      bool interfered = false;
2845      exec_list_iterator scan_iter = iter;
2846      scan_iter.next();
2847      for (; scan_iter.has_next(); scan_iter.next()) {
2848	 fs_inst *scan_inst = (fs_inst *)scan_iter.get();
2849
2850	 if (scan_inst->opcode == BRW_OPCODE_DO ||
2851	     scan_inst->opcode == BRW_OPCODE_WHILE ||
2852	     scan_inst->opcode == BRW_OPCODE_ENDIF) {
2853	    interfered = true;
2854	    iter = scan_iter;
2855	    break;
2856	 }
2857
2858	 if (scan_inst->dst.file == GRF) {
2859	    if (scan_inst->dst.reg == inst->dst.reg &&
2860		(scan_inst->dst.reg_offset == inst->dst.reg_offset ||
2861		 scan_inst->opcode == FS_OPCODE_TEX)) {
2862	       interfered = true;
2863	       break;
2864	    }
2865	    if (scan_inst->dst.reg == inst->src[0].reg &&
2866		(scan_inst->dst.reg_offset == inst->src[0].reg_offset ||
2867		 scan_inst->opcode == FS_OPCODE_TEX)) {
2868	       interfered = true;
2869	       break;
2870	    }
2871	 }
2872      }
2873      if (interfered) {
2874	 continue;
2875      }
2876
2877      /* Update live interval so we don't have to recalculate. */
2878      this->virtual_grf_use[inst->src[0].reg] = MAX2(virtual_grf_use[inst->src[0].reg],
2879						     virtual_grf_use[inst->dst.reg]);
2880
2881      /* Rewrite the later usage to point at the source of the move to
2882       * be removed.
2883       */
2884      for (exec_list_iterator scan_iter = iter; scan_iter.has_next();
2885	   scan_iter.next()) {
2886	 fs_inst *scan_inst = (fs_inst *)scan_iter.get();
2887
2888	 for (int i = 0; i < 3; i++) {
2889	    if (scan_inst->src[i].file == GRF &&
2890		scan_inst->src[i].reg == inst->dst.reg &&
2891		scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
2892	       scan_inst->src[i].reg = inst->src[0].reg;
2893	       scan_inst->src[i].reg_offset = inst->src[0].reg_offset;
2894	       scan_inst->src[i].abs |= inst->src[0].abs;
2895	       scan_inst->src[i].negate ^= inst->src[0].negate;
2896	       scan_inst->src[i].smear = inst->src[0].smear;
2897	    }
2898	 }
2899      }
2900
2901      inst->remove();
2902      progress = true;
2903   }
2904
2905   return progress;
2906}
2907
2908
2909bool
2910fs_visitor::compute_to_mrf()
2911{
2912   bool progress = false;
2913   int next_ip = 0;
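   /* e.g. "add vgrf4, x, y; mov m3, vgrf4" becomes "add m3, x, y" when
    * vgrf4 has no later readers and nothing else touches m3 in between.
    */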
2914
2915   foreach_iter(exec_list_iterator, iter, this->instructions) {
2916      fs_inst *inst = (fs_inst *)iter.get();
2917
2918      int ip = next_ip;
2919      next_ip++;
2920
2921      if (inst->opcode != BRW_OPCODE_MOV ||
2922	  inst->predicated ||
2923	  inst->dst.file != MRF || inst->src[0].file != GRF ||
2924	  inst->dst.type != inst->src[0].type ||
2925	  inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
2926	 continue;
2927
2928      /* Can't compute-to-MRF this GRF if someone else was going to
2929       * read it later.
2930       */
2931      if (this->virtual_grf_use[inst->src[0].reg] > ip)
2932	 continue;
2933
2934      /* Found a move of a GRF to a MRF.  Let's see if we can go
2935       * rewrite the thing that made this GRF to write into the MRF.
2936       */
2937      bool found = false;
2938      fs_inst *scan_inst;
2939      for (scan_inst = (fs_inst *)inst->prev;
2940	   scan_inst->prev != NULL;
2941	   scan_inst = (fs_inst *)scan_inst->prev) {
2942	 /* We don't handle flow control here.  Most computation of
2943	  * values that end up in MRFs happens shortly before the MRF
2944	  * write anyway.
2945	  */
2946	 if (scan_inst->opcode == BRW_OPCODE_DO ||
2947	     scan_inst->opcode == BRW_OPCODE_WHILE ||
2948	     scan_inst->opcode == BRW_OPCODE_ENDIF) {
2949	    break;
2950	 }
2951
2952	 /* You can't read from an MRF, so if someone else reads our
2953	  * MRF's source GRF that we wanted to rewrite, that stops us.
2954	  */
2955	 bool interfered = false;
2956	 for (int i = 0; i < 3; i++) {
2957	    if (scan_inst->src[i].file == GRF &&
2958		scan_inst->src[i].reg == inst->src[0].reg &&
2959		scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2960	       interfered = true;
2961	    }
2962	 }
2963	 if (interfered)
2964	    break;
2965
2966	 if (scan_inst->dst.file == MRF &&
2967	     scan_inst->dst.hw_reg == inst->dst.hw_reg) {
2968	    /* Somebody else wrote our MRF here, so we can't
2969	     * compute-to-MRF before that.
2970	     */
2971	    break;
2972	 }
2973
2974	 if (scan_inst->mlen > 0) {
2975	    /* Found a SEND instruction, which will do some amount of
2976	     * implied write that may overwrite our MRF that we were
2977	     * hoping to compute-to-MRF somewhere above it.  Nothing
2978	     * we emit does an implied write more than 2 MRFs past
2979	     * base_mrf, though.
2980	     */
2981	    int implied_write_len = MIN2(scan_inst->mlen, 2);
2982	    if (inst->dst.hw_reg >= scan_inst->base_mrf &&
2983		inst->dst.hw_reg < scan_inst->base_mrf + implied_write_len) {
2984	       break;
2985	    }
2986	 }
2987
2988	 if (scan_inst->dst.file == GRF &&
2989	     scan_inst->dst.reg == inst->src[0].reg) {
2990	    /* Found the last thing to write our reg we want to turn
2991	     * into a compute-to-MRF.
2992	     */
2993
2994	    if (scan_inst->opcode == FS_OPCODE_TEX) {
2995	       /* texturing writes several contiguous regs, so we can't
2996		* compute-to-mrf that.
2997		*/
2998	       break;
2999	    }
3000
3001	    /* If it's predicated, it (probably) didn't populate all
3002	     * the channels.
3003	     */
3004	    if (scan_inst->predicated)
3005	       break;
3006
3007	    /* SEND instructions can't have MRF as a destination. */
3008	    if (scan_inst->mlen)
3009	       break;
3010
3011	    if (intel->gen >= 6) {
3012	       /* gen6 math instructions must have the destination be
3013		* GRF, so no compute-to-MRF for them.
3014		*/
3015	       if (scan_inst->opcode == FS_OPCODE_RCP ||
3016		   scan_inst->opcode == FS_OPCODE_RSQ ||
3017		   scan_inst->opcode == FS_OPCODE_SQRT ||
3018		   scan_inst->opcode == FS_OPCODE_EXP2 ||
3019		   scan_inst->opcode == FS_OPCODE_LOG2 ||
3020		   scan_inst->opcode == FS_OPCODE_SIN ||
3021		   scan_inst->opcode == FS_OPCODE_COS ||
3022		   scan_inst->opcode == FS_OPCODE_POW) {
3023		  break;
3024	       }
3025	    }
3026
3027	    if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
3028	       /* Found the creator of our MRF's source value. */
3029	       found = true;
3030	       break;
3031	    }
3032	 }
3033      }
3034      if (found) {
3035	 scan_inst->dst.file = MRF;
3036	 scan_inst->dst.hw_reg = inst->dst.hw_reg;
3037	 scan_inst->saturate |= inst->saturate;
3038	 inst->remove();
3039	 progress = true;
3040      }
3041   }
3042
3043   return progress;
3044}
3045
3046bool
3047fs_visitor::virtual_grf_interferes(int a, int b)
3048{
3049   int start = MAX2(this->virtual_grf_def[a], this->virtual_grf_def[b]);
3050   int end = MIN2(this->virtual_grf_use[a], this->virtual_grf_use[b]);
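   /* e.g. ranges [2, 10) and [8, 12) give start == 8, end == 10 and thus
    * interfere; [2, 8) and [8, 12) just touch and do not.
    */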
3051
3052   /* For dead code, just check if the def interferes with the other range. */
3053   if (this->virtual_grf_use[a] == -1) {
3054      return (this->virtual_grf_def[a] >= this->virtual_grf_def[b] &&
3055	      this->virtual_grf_def[a] < this->virtual_grf_use[b]);
3056   }
3057   if (this->virtual_grf_use[b] == -1) {
3058      return (this->virtual_grf_def[b] >= this->virtual_grf_def[a] &&
3059	      this->virtual_grf_def[b] < this->virtual_grf_use[a]);
3060   }
3061
3062   return start < end;
3063}
3064
3065static struct brw_reg brw_reg_from_fs_reg(fs_reg *reg)
3066{
3067   struct brw_reg brw_reg;
3068
3069   switch (reg->file) {
3070   case GRF:
3071   case ARF:
3072   case MRF:
3073      if (reg->smear == -1) {
3074	 brw_reg = brw_vec8_reg(reg->file,
3075				reg->hw_reg, 0);
3076      } else {
3077	 brw_reg = brw_vec1_reg(reg->file,
3078				reg->hw_reg, reg->smear);
3079      }
3080      brw_reg = retype(brw_reg, reg->type);
3081      break;
3082   case IMM:
3083      switch (reg->type) {
3084      case BRW_REGISTER_TYPE_F:
3085	 brw_reg = brw_imm_f(reg->imm.f);
3086	 break;
3087      case BRW_REGISTER_TYPE_D:
3088	 brw_reg = brw_imm_d(reg->imm.i);
3089	 break;
3090      case BRW_REGISTER_TYPE_UD:
3091	 brw_reg = brw_imm_ud(reg->imm.u);
3092	 break;
3093      default:
3094	 assert(!"not reached");
3095	 break;
3096      }
3097      break;
3098   case FIXED_HW_REG:
3099      brw_reg = reg->fixed_hw_reg;
3100      break;
3101   case BAD_FILE:
3102      /* Probably unused. */
3103      brw_reg = brw_null_reg();
3104      break;
3105   case UNIFORM:
3106      assert(!"not reached");
3107      brw_reg = brw_null_reg();
3108      break;
3109   }
3110   if (reg->abs)
3111      brw_reg = brw_abs(brw_reg);
3112   if (reg->negate)
3113      brw_reg = negate(brw_reg);
3114
3115   return brw_reg;
3116}
3117
3118void
3119fs_visitor::generate_code()
3120{
3121   int last_native_inst = 0;
3122   struct brw_instruction *if_stack[16], *loop_stack[16];
3123   int if_stack_depth = 0, loop_stack_depth = 0;
3124   int if_depth_in_loop[16];
3125   const char *last_annotation_string = NULL;
3126   ir_instruction *last_annotation_ir = NULL;
3127
3128   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3129      printf("Native code for fragment shader %d:\n",
3130	     ctx->Shader.CurrentFragmentProgram->Name);
3131   }
3132
3133   if_depth_in_loop[loop_stack_depth] = 0;
3134
3135   memset(&if_stack, 0, sizeof(if_stack));
3136   foreach_iter(exec_list_iterator, iter, this->instructions) {
3137      fs_inst *inst = (fs_inst *)iter.get();
3138      struct brw_reg src[3], dst;
3139
3140      if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3141	 if (last_annotation_ir != inst->ir) {
3142	    last_annotation_ir = inst->ir;
3143	    if (last_annotation_ir) {
3144	       printf("   ");
3145	       last_annotation_ir->print();
3146	       printf("\n");
3147	    }
3148	 }
3149	 if (last_annotation_string != inst->annotation) {
3150	    last_annotation_string = inst->annotation;
3151	    if (last_annotation_string)
3152	       printf("   %s\n", last_annotation_string);
3153	 }
3154      }
3155
3156      for (unsigned int i = 0; i < 3; i++) {
3157	 src[i] = brw_reg_from_fs_reg(&inst->src[i]);
3158      }
3159      dst = brw_reg_from_fs_reg(&inst->dst);
3160
3161      brw_set_conditionalmod(p, inst->conditional_mod);
3162      brw_set_predicate_control(p, inst->predicated);
3163
3164      switch (inst->opcode) {
3165      case BRW_OPCODE_MOV:
3166	 brw_MOV(p, dst, src[0]);
3167	 break;
3168      case BRW_OPCODE_ADD:
3169	 brw_ADD(p, dst, src[0], src[1]);
3170	 break;
3171      case BRW_OPCODE_MUL:
3172	 brw_MUL(p, dst, src[0], src[1]);
3173	 break;
3174
3175      case BRW_OPCODE_FRC:
3176	 brw_FRC(p, dst, src[0]);
3177	 break;
3178      case BRW_OPCODE_RNDD:
3179	 brw_RNDD(p, dst, src[0]);
3180	 break;
3181      case BRW_OPCODE_RNDE:
3182	 brw_RNDE(p, dst, src[0]);
3183	 break;
3184      case BRW_OPCODE_RNDZ:
3185	 brw_RNDZ(p, dst, src[0]);
3186	 break;
3187
3188      case BRW_OPCODE_AND:
3189	 brw_AND(p, dst, src[0], src[1]);
3190	 break;
3191      case BRW_OPCODE_OR:
3192	 brw_OR(p, dst, src[0], src[1]);
3193	 break;
3194      case BRW_OPCODE_XOR:
3195	 brw_XOR(p, dst, src[0], src[1]);
3196	 break;
3197      case BRW_OPCODE_NOT:
3198	 brw_NOT(p, dst, src[0]);
3199	 break;
3200      case BRW_OPCODE_ASR:
3201	 brw_ASR(p, dst, src[0], src[1]);
3202	 break;
3203      case BRW_OPCODE_SHR:
3204	 brw_SHR(p, dst, src[0], src[1]);
3205	 break;
3206      case BRW_OPCODE_SHL:
3207	 brw_SHL(p, dst, src[0], src[1]);
3208	 break;
3209
3210      case BRW_OPCODE_CMP:
3211	 brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
3212	 break;
3213      case BRW_OPCODE_SEL:
3214	 brw_SEL(p, dst, src[0], src[1]);
3215	 break;
3216
3217      case BRW_OPCODE_IF:
3218	 assert(if_stack_depth < 16);
3219	 if (inst->src[0].file != BAD_FILE) {
3220	    assert(intel->gen >= 6);
3221	    if_stack[if_stack_depth] = brw_IF_gen6(p, inst->conditional_mod, src[0], src[1]);
3222	 } else {
3223	    if_stack[if_stack_depth] = brw_IF(p, BRW_EXECUTE_8);
3224	 }
3225	 if_depth_in_loop[loop_stack_depth]++;
3226	 if_stack_depth++;
3227	 break;
3228
3229      case BRW_OPCODE_ELSE:
3230	 if_stack[if_stack_depth - 1] =
3231	    brw_ELSE(p, if_stack[if_stack_depth - 1]);
3232	 break;
3233      case BRW_OPCODE_ENDIF:
3234	 if_stack_depth--;
3235	 brw_ENDIF(p, if_stack[if_stack_depth]);
3236	 if_depth_in_loop[loop_stack_depth]--;
3237	 break;
3238
3239      case BRW_OPCODE_DO:
	 assert(loop_stack_depth < 16);
3240	 loop_stack[loop_stack_depth++] = brw_DO(p, BRW_EXECUTE_8);
3241	 if_depth_in_loop[loop_stack_depth] = 0;
3242	 break;
3243
3244      case BRW_OPCODE_BREAK:
3245	 brw_BREAK(p, if_depth_in_loop[loop_stack_depth]);
3246	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
3247	 break;
3248      case BRW_OPCODE_CONTINUE:
3249	 brw_CONT(p, if_depth_in_loop[loop_stack_depth]);
3250	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
3251	 break;
3252
3253      case BRW_OPCODE_WHILE: {
3254	 struct brw_instruction *inst0, *inst1;
3255	 GLuint br = 1;
3256
3257	 if (intel->gen >= 5)
3258	    br = 2;
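	 /* On gen5+ the BREAK/CONT jump counts appear to be in units of
	  * 64 bits (half an EU instruction), hence the factor of two
	  * applied below.
	  */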
3259
3260	 assert(loop_stack_depth > 0);
3261	 loop_stack_depth--;
3262	 inst0 = inst1 = brw_WHILE(p, loop_stack[loop_stack_depth]);
3263	 /* patch all the BREAK/CONT instructions from last BGNLOOP */
3264	 while (inst0 > loop_stack[loop_stack_depth]) {
3265	    inst0--;
3266	    if (inst0->header.opcode == BRW_OPCODE_BREAK &&
3267		inst0->bits3.if_else.jump_count == 0) {
3268	       inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
3269	    }
3270	    else if (inst0->header.opcode == BRW_OPCODE_CONTINUE &&
3271		     inst0->bits3.if_else.jump_count == 0) {
3272	       inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
3273	    }
3274	 }
3275      }
3276	 break;
3277
3278      case FS_OPCODE_RCP:
3279      case FS_OPCODE_RSQ:
3280      case FS_OPCODE_SQRT:
3281      case FS_OPCODE_EXP2:
3282      case FS_OPCODE_LOG2:
3283      case FS_OPCODE_POW:
3284      case FS_OPCODE_SIN:
3285      case FS_OPCODE_COS:
3286	 generate_math(inst, dst, src);
3287	 break;
3288      case FS_OPCODE_LINTERP:
3289	 generate_linterp(inst, dst, src);
3290	 break;
3291      case FS_OPCODE_TEX:
3292      case FS_OPCODE_TXB:
3293      case FS_OPCODE_TXL:
3294	 generate_tex(inst, dst);
3295	 break;
3296      case FS_OPCODE_DISCARD_NOT:
3297	 generate_discard_not(inst, dst);
3298	 break;
3299      case FS_OPCODE_DISCARD_AND:
3300	 generate_discard_and(inst, src[0]);
3301	 break;
3302      case FS_OPCODE_DDX:
3303	 generate_ddx(inst, dst, src[0]);
3304	 break;
3305      case FS_OPCODE_DDY:
3306	 generate_ddy(inst, dst, src[0]);
3307	 break;
3308
3309      case FS_OPCODE_SPILL:
3310	 generate_spill(inst, src[0]);
3311	 break;
3312
3313      case FS_OPCODE_UNSPILL:
3314	 generate_unspill(inst, dst);
3315	 break;
3316
3317      case FS_OPCODE_PULL_CONSTANT_LOAD:
3318	 generate_pull_constant_load(inst, dst);
3319	 break;
3320
3321      case FS_OPCODE_FB_WRITE:
3322	 generate_fb_write(inst);
3323	 break;
3324      default:
3325	 if (inst->opcode < (int)ARRAY_SIZE(brw_opcodes)) {
3326	    _mesa_problem(ctx, "Unsupported opcode `%s' in FS",
3327			  brw_opcodes[inst->opcode].name);
3328	 } else {
3329	    _mesa_problem(ctx, "Unsupported opcode %d in FS", inst->opcode);
3330	 }
3331	 this->fail = true;
3332      }
3333
3334      if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3335	 for (unsigned int i = last_native_inst; i < p->nr_insn; i++) {
3336	    if (0) {
3337	       printf("0x%08x 0x%08x 0x%08x 0x%08x ",
3338		      ((uint32_t *)&p->store[i])[3],
3339		      ((uint32_t *)&p->store[i])[2],
3340		      ((uint32_t *)&p->store[i])[1],
3341		      ((uint32_t *)&p->store[i])[0]);
3342	    }
3343	    brw_disasm(stdout, &p->store[i], intel->gen);
3344	    printf("\n");
3345	 }
3346      }
3347
3348      last_native_inst = p->nr_insn;
3349   }
3350}
3351
3352GLboolean
3353brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c)
3354{
3355   struct intel_context *intel = &brw->intel;
3356   struct gl_context *ctx = &intel->ctx;
3357   struct gl_shader_program *prog = ctx->Shader.CurrentFragmentProgram;
3358
3359   if (!prog)
3360      return GL_FALSE;
3361
3362   struct brw_shader *shader =
3363     (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3364   if (!shader)
3365      return GL_FALSE;
3366
3367   /* We always use 8-wide mode, at least for now.  For one, flow
3368    * control only works in 8-wide.  Also, when we're fragment shader
3369    * bound, we're almost always under register pressure as well, so
3370    * 8-wide would save us from the performance cliff of spilling
3371    * regs.
3372    */
3373   c->dispatch_width = 8;
3374
3375   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3376      printf("GLSL IR for native fragment shader %d:\n", prog->Name);
3377      _mesa_print_ir(shader->ir, NULL);
3378      printf("\n");
3379   }
3380
3381   /* Now the main event: Visit the shader IR and generate our FS IR for it.
3382    */
3383   fs_visitor v(c, shader);
3384
3385   if (0) {
3386      v.emit_dummy_fs();
3387   } else {
3388      v.calculate_urb_setup();
3389      if (intel->gen < 6)
3390	 v.emit_interpolation_setup_gen4();
3391      else
3392	 v.emit_interpolation_setup_gen6();
3393
3394      /* Generate FS IR for main().  (the visitor only descends into
3395       * functions called "main").
3396       */
3397      foreach_iter(exec_list_iterator, iter, *shader->ir) {
3398	 ir_instruction *ir = (ir_instruction *)iter.get();
3399	 v.base_ir = ir;
3400	 ir->accept(&v);
3401      }
3402
3403      v.emit_fb_writes();
3404
3405      v.split_virtual_grfs();
3406      v.setup_pull_constants();
3407
3408      v.assign_curb_setup();
3409      v.assign_urb_setup();
3410
3411      bool progress;
3412      do {
3413	 progress = false;
3414	 v.calculate_live_intervals();
3415	 progress = v.propagate_constants() || progress;
3416	 progress = v.register_coalesce() || progress;
3417	 progress = v.compute_to_mrf() || progress;
3418	 progress = v.dead_code_eliminate() || progress;
3419      } while (progress);
3420
3421      if (0) {
3422	 /* Debug of register spilling: Go spill everything. */
3423	 int virtual_grf_count = v.virtual_grf_next;
3424	 for (int i = 1; i < virtual_grf_count; i++) {
3425	    v.spill_reg(i);
3426	 }
3427	 v.calculate_live_intervals();
3428      }
3429
3430      if (0)
3431	 v.assign_regs_trivial();
3432      else {
3433	 while (!v.assign_regs()) {
3434	    if (v.fail)
3435	       break;
3436
3437	    v.calculate_live_intervals();
3438	 }
3439      }
3440   }
3441
3442   if (!v.fail)
3443      v.generate_code();
3444
3445   assert(!v.fail); /* FINISHME: Cleanly fail, tested at link time, etc. */
3446
3447   if (v.fail)
3448      return GL_FALSE;
3449
3450   c->prog_data.total_grf = v.grf_used;
3451
3452   return GL_TRUE;
3453}
3454