brw_fs.cpp revision 0cadd32b6dc80455802c04b479ec8e768f93ffe1
/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Eric Anholt <eric@anholt.net>
 *
 */

extern "C" {

#include <sys/types.h>

#include "main/macros.h"
#include "main/shaderobj.h"
#include "main/uniforms.h"
#include "program/prog_parameter.h"
#include "program/prog_print.h"
#include "program/prog_optimize.h"
#include "program/register_allocate.h"
#include "program/sampler.h"
#include "program/hash_table.h"
#include "brw_context.h"
#include "brw_eu.h"
#include "brw_wm.h"
#include "talloc.h"
}
#include "brw_fs.h"
#include "../glsl/glsl_types.h"
#include "../glsl/ir_optimization.h"
#include "../glsl/ir_print_visitor.h"

static int using_new_fs = -1;
static struct brw_reg brw_reg_from_fs_reg(class fs_reg *reg);

struct gl_shader *
brw_new_shader(GLcontext *ctx, GLuint name, GLuint type)
{
   struct brw_shader *shader;

   shader = talloc_zero(NULL, struct brw_shader);
   if (shader) {
      shader->base.Type = type;
      shader->base.Name = name;
      _mesa_init_shader(ctx, &shader->base);
   }

   return &shader->base;
}

struct gl_shader_program *
brw_new_shader_program(GLcontext *ctx, GLuint name)
{
   struct brw_shader_program *prog;
   prog = talloc_zero(NULL, struct brw_shader_program);
   if (prog) {
      prog->base.Name = name;
      _mesa_init_shader_program(ctx, &prog->base);
   }
   return &prog->base;
}

GLboolean
brw_compile_shader(GLcontext *ctx, struct gl_shader *shader)
{
   if (!_mesa_ir_compile_shader(ctx, shader))
      return GL_FALSE;

   return GL_TRUE;
}

GLboolean
brw_link_shader(GLcontext *ctx, struct gl_shader_program *prog)
{
   struct intel_context *intel = intel_context(ctx);
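   /* The new GLSL2 FS backend is opt-in at runtime: it is only used
    * when the INTEL_NEW_FS environment variable is set.
    */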
   if (using_new_fs == -1)
      using_new_fs = getenv("INTEL_NEW_FS") != NULL;

   for (unsigned i = 0; i < prog->_NumLinkedShaders; i++) {
      struct brw_shader *shader = (struct brw_shader *)prog->_LinkedShaders[i];

      if (using_new_fs && shader->base.Type == GL_FRAGMENT_SHADER) {
         void *mem_ctx = talloc_new(NULL);
         bool progress;

         if (shader->ir)
            talloc_free(shader->ir);
         shader->ir = new(shader) exec_list;
         clone_ir_list(mem_ctx, shader->ir, shader->base.ir);

         do_mat_op_to_vec(shader->ir);
         do_mod_to_fract(shader->ir);
         do_div_to_mul_rcp(shader->ir);
         do_sub_to_add_neg(shader->ir);
         do_explog_to_explog2(shader->ir);
         do_lower_texture_projection(shader->ir);
         brw_do_cubemap_normalize(shader->ir);
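         /* Iterate the optimization passes to a fixed point: each
          * lowering can expose new opportunities for the others.
          */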
         do {
            progress = false;

            brw_do_channel_expressions(shader->ir);
            brw_do_vector_splitting(shader->ir);

            progress = do_lower_jumps(shader->ir, true, true,
                                      true, /* main return */
                                      false, /* continue */
                                      false /* loops */
                                      ) || progress;

            progress = do_common_optimization(shader->ir, true, 32) || progress;

            progress = lower_noise(shader->ir) || progress;
            progress =
               lower_variable_index_to_cond_assign(shader->ir,
                                                   GL_TRUE, /* input */
                                                   GL_TRUE, /* output */
                                                   GL_TRUE, /* temp */
                                                   GL_TRUE /* uniform */
                                                   ) || progress;
            if (intel->gen == 6) {
               progress = do_if_to_cond_assign(shader->ir) || progress;
            }
         } while (progress);

         validate_ir_tree(shader->ir);

         reparent_ir(shader->ir, shader->ir);
         talloc_free(mem_ctx);
      }
   }

   if (!_mesa_ir_link_shader(ctx, prog))
      return GL_FALSE;

   return GL_TRUE;
}

static int
type_size(const struct glsl_type *type)
{
   unsigned int size, i;

   switch (type->base_type) {
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_BOOL:
      return type->components();
   case GLSL_TYPE_ARRAY:
      return type_size(type->fields.array) * type->length;
   case GLSL_TYPE_STRUCT:
      size = 0;
      for (i = 0; i < type->length; i++) {
         size += type_size(type->fields.structure[i].type);
      }
      return size;
   case GLSL_TYPE_SAMPLER:
      /* Samplers take up no register space, since they're baked in at
       * link time.
       */
      return 0;
   default:
      assert(!"not reached");
      return 0;
   }
}

static const fs_reg reg_undef;
static const fs_reg reg_null(ARF, BRW_ARF_NULL);

int
fs_visitor::virtual_grf_alloc(int size)
{
   if (virtual_grf_array_size <= virtual_grf_next) {
      if (virtual_grf_array_size == 0)
         virtual_grf_array_size = 16;
      else
         virtual_grf_array_size *= 2;
      virtual_grf_sizes = talloc_realloc(mem_ctx, virtual_grf_sizes,
                                         int, virtual_grf_array_size);

      /* This slot is always unused. */
      virtual_grf_sizes[0] = 0;
   }
   virtual_grf_sizes[virtual_grf_next] = size;
   return virtual_grf_next++;
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int hw_reg)
{
   init();
   this->file = file;
   this->hw_reg = hw_reg;
   this->type = BRW_REGISTER_TYPE_F;
}

int
brw_type_for_base_type(const struct glsl_type *type)
{
   switch (type->base_type) {
   case GLSL_TYPE_FLOAT:
      return BRW_REGISTER_TYPE_F;
   case GLSL_TYPE_INT:
   case GLSL_TYPE_BOOL:
      return BRW_REGISTER_TYPE_D;
   case GLSL_TYPE_UINT:
      return BRW_REGISTER_TYPE_UD;
   case GLSL_TYPE_ARRAY:
   case GLSL_TYPE_STRUCT:
      /* These should be overridden with the type of the member when
       * dereferenced into.  BRW_REGISTER_TYPE_UD seems like a likely
       * way to trip up if we don't.
       */
      return BRW_REGISTER_TYPE_UD;
   default:
      assert(!"not reached");
      return BRW_REGISTER_TYPE_F;
   }
}

/** Automatic reg constructor. */
fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
{
   init();

   this->file = GRF;
   this->reg = v->virtual_grf_alloc(type_size(type));
   this->reg_offset = 0;
   this->type = brw_type_for_base_type(type);
}

fs_reg *
fs_visitor::variable_storage(ir_variable *var)
{
   return (fs_reg *)hash_table_find(this->variable_ht, var);
}

/* Our support for uniforms is piggy-backed on the struct
 * gl_fragment_program, because that's where the values actually
 * get stored, rather than in some global gl_shader_program uniform
 * store.
 */
int
fs_visitor::setup_uniform_values(int loc, const glsl_type *type)
{
   unsigned int offset = 0;
   float *vec_values;

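   /* A matrix is stored as matrix_columns consecutive column vectors,
    * so recurse on a float vector type once per column.
    */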
   if (type->is_matrix()) {
      const glsl_type *column = glsl_type::get_instance(GLSL_TYPE_FLOAT,
                                                        type->vector_elements,
                                                        1);

      for (unsigned int i = 0; i < type->matrix_columns; i++) {
         offset += setup_uniform_values(loc + offset, column);
      }

      return offset;
   }

   switch (type->base_type) {
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_BOOL:
      vec_values = fp->Base.Parameters->ParameterValues[loc];
      for (unsigned int i = 0; i < type->vector_elements; i++) {
         c->prog_data.param[c->prog_data.nr_params++] = &vec_values[i];
      }
      return 1;

   case GLSL_TYPE_STRUCT:
      for (unsigned int i = 0; i < type->length; i++) {
         offset += setup_uniform_values(loc + offset,
                                        type->fields.structure[i].type);
      }
      return offset;

   case GLSL_TYPE_ARRAY:
      for (unsigned int i = 0; i < type->length; i++) {
         offset += setup_uniform_values(loc + offset, type->fields.array);
      }
      return offset;

   case GLSL_TYPE_SAMPLER:
      /* The sampler takes up a slot, but we don't use any values from it. */
      return 1;

   default:
      assert(!"not reached");
      return 0;
   }
}


/* Our support for builtin uniforms is even scarier than non-builtin.
 * It sits on top of the PROG_STATE_VAR parameters that are
 * automatically updated from GL context state.
 */
void
fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
{
   const struct gl_builtin_uniform_desc *statevar;

   /* Walk to the matching entry, or to the sentinel (NULL name) entry
    * so a failed search is detectable below.
    */
   for (statevar = _mesa_builtin_uniform_desc; statevar->name; statevar++) {
      if (strcmp(ir->name, statevar->name) == 0)
         break;
   }

   if (!statevar->name) {
      this->fail = true;
      printf("Failed to find builtin uniform `%s'\n", ir->name);
      return;
   }

   int array_count;
   if (ir->type->is_array()) {
      array_count = ir->type->length;
   } else {
      array_count = 1;
   }

   for (int a = 0; a < array_count; a++) {
      for (unsigned int i = 0; i < statevar->num_elements; i++) {
         struct gl_builtin_uniform_element *element = &statevar->elements[i];
         int tokens[STATE_LENGTH];

         memcpy(tokens, element->tokens, sizeof(element->tokens));
         if (ir->type->is_array()) {
            tokens[1] = a;
         }

         /* This state reference has already been setup by ir_to_mesa,
          * but we'll get the same index back here.
          */
         int index = _mesa_add_state_reference(this->fp->Base.Parameters,
                                               (gl_state_index *)tokens);
         float *vec_values = this->fp->Base.Parameters->ParameterValues[index];

         /* Add each of the unique swizzles of the element as a
          * parameter.  This'll end up matching the expected layout of
          * the array/matrix/structure we're trying to fill in.
          */
         int last_swiz = -1;
         for (unsigned int j = 0; j < 4; j++) {
            int swiz = GET_SWZ(element->swizzle, j);
            if (swiz == last_swiz)
               break;
            last_swiz = swiz;

            c->prog_data.param[c->prog_data.nr_params++] = &vec_values[swiz];
         }
      }
   }
}

fs_reg *
fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   fs_reg wpos = *reg;

   /* gl_FragCoord.x */
   if (ir->pixel_center_integer) {
      emit(fs_inst(BRW_OPCODE_MOV, wpos, this->pixel_x));
   } else {
      emit(fs_inst(BRW_OPCODE_ADD, wpos, this->pixel_x, fs_reg(0.5f)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.y */
   if (ir->origin_upper_left && ir->pixel_center_integer) {
      emit(fs_inst(BRW_OPCODE_MOV, wpos, this->pixel_y));
   } else {
      fs_reg pixel_y = this->pixel_y;
      float offset = (ir->pixel_center_integer ? 0.0f : 0.5f);

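      /* Flip Y for a lower-left origin: the window-space row is
       * (drawable_height - 1) - pixel_y.
       */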
      if (!ir->origin_upper_left) {
         pixel_y.negate = true;
         offset += c->key.drawable_height - 1.0f;
      }

      emit(fs_inst(BRW_OPCODE_ADD, wpos, pixel_y, fs_reg(offset)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.z */
   emit(fs_inst(FS_OPCODE_LINTERP, wpos, this->delta_x, this->delta_y,
                interp_reg(FRAG_ATTRIB_WPOS, 2)));
   wpos.reg_offset++;

   /* gl_FragCoord.w: Already set up in emit_interpolation_setup_*() */
   emit(fs_inst(BRW_OPCODE_MOV, wpos, this->wpos_w));

   return reg;
}

fs_reg *
fs_visitor::emit_general_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   /* Interpolation is always in floating point regs. */
   reg->type = BRW_REGISTER_TYPE_F;
   fs_reg attr = *reg;

   unsigned int array_elements;
   const glsl_type *type;

   if (ir->type->is_array()) {
      array_elements = ir->type->length;
      if (array_elements == 0) {
         this->fail = true;
      }
      type = ir->type->fields.array;
   } else {
      array_elements = 1;
      type = ir->type;
   }

   int location = ir->location;
   for (unsigned int i = 0; i < array_elements; i++) {
      for (unsigned int j = 0; j < type->matrix_columns; j++) {
         if (urb_setup[location] == -1) {
            /* If there's no incoming setup data for this slot, don't
             * emit interpolation for it.
             */
            attr.reg_offset += type->vector_elements;
            location++;
            continue;
         }

         for (unsigned int c = 0; c < type->vector_elements; c++) {
            struct brw_reg interp = interp_reg(location, c);
            emit(fs_inst(FS_OPCODE_LINTERP,
                         attr,
                         this->delta_x,
                         this->delta_y,
                         fs_reg(interp)));
            attr.reg_offset++;
         }

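         /* Before gen6 the setup data is pre-divided by W, so the
          * interpolated value is attribute/w; multiply by pixel_w to
          * undo the perspective divide.
          */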
         if (intel->gen < 6) {
            attr.reg_offset -= type->vector_elements;
            for (unsigned int c = 0; c < type->vector_elements; c++) {
               emit(fs_inst(BRW_OPCODE_MUL,
                            attr,
                            attr,
                            this->pixel_w));
               attr.reg_offset++;
            }
         }
         location++;
      }
   }

   return reg;
}

fs_reg *
fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);

   /* The frontfacing comes in as a bit in the thread payload. */
   if (intel->gen >= 6) {
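      /* The back-facing flag is bit 15 of g0.0:D; arithmetic-shift it
       * down, invert, and mask so the result is a 0/1 front-facing flag.
       */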
      emit(fs_inst(BRW_OPCODE_ASR,
                   *reg,
                   fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
                   fs_reg(15)));
      emit(fs_inst(BRW_OPCODE_NOT,
                   *reg,
                   *reg));
      emit(fs_inst(BRW_OPCODE_AND,
                   *reg,
                   *reg,
                   fs_reg(1)));
   } else {
      struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
      /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
       * us front face
       */
      fs_inst *inst = emit(fs_inst(BRW_OPCODE_CMP,
                                   *reg,
                                   fs_reg(r1_6ud),
                                   fs_reg(1u << 31)));
      inst->conditional_mod = BRW_CONDITIONAL_L;
      emit(fs_inst(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u)));
   }

   return reg;
}

fs_inst *
fs_visitor::emit_math(fs_opcodes opcode, fs_reg dst, fs_reg src)
{
   switch (opcode) {
   case FS_OPCODE_RCP:
   case FS_OPCODE_RSQ:
   case FS_OPCODE_SQRT:
   case FS_OPCODE_EXP2:
   case FS_OPCODE_LOG2:
   case FS_OPCODE_SIN:
   case FS_OPCODE_COS:
      break;
   default:
      assert(!"not reached: bad math opcode");
      return NULL;
   }

   /* Can't do hstride == 0 args to gen6 math, so expand it out.  We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    */
   if (intel->gen >= 6 && src.file == UNIFORM) {
      fs_reg expanded = fs_reg(this, glsl_type::float_type);
      emit(fs_inst(BRW_OPCODE_MOV, expanded, src));
      src = expanded;
   }

   fs_inst *inst = emit(fs_inst(opcode, dst, src));

   if (intel->gen < 6) {
      inst->base_mrf = 2;
      inst->mlen = 1;
   }

   return inst;
}

fs_inst *
fs_visitor::emit_math(fs_opcodes opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   int base_mrf = 2;
   fs_inst *inst;

   assert(opcode == FS_OPCODE_POW);

   if (intel->gen >= 6) {
      /* Can't do hstride == 0 args to gen6 math, so expand it out. */
      if (src0.file == UNIFORM) {
         fs_reg expanded = fs_reg(this, glsl_type::float_type);
         emit(fs_inst(BRW_OPCODE_MOV, expanded, src0));
         src0 = expanded;
      }

      if (src1.file == UNIFORM) {
         fs_reg expanded = fs_reg(this, glsl_type::float_type);
         emit(fs_inst(BRW_OPCODE_MOV, expanded, src1));
         src1 = expanded;
      }

      inst = emit(fs_inst(opcode, dst, src0, src1));
   } else {
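      /* Pre-gen6 math is a message send: the second operand has to be
       * loaded into the MRF following the first, hence mlen == 2.
       */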
      emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1), src1));
      inst = emit(fs_inst(opcode, dst, src0, reg_null));

      inst->base_mrf = base_mrf;
      inst->mlen = 2;
   }
   return inst;
}

void
fs_visitor::visit(ir_variable *ir)
{
   fs_reg *reg = NULL;

   if (variable_storage(ir))
      return;

   if (strcmp(ir->name, "gl_FragColor") == 0) {
      this->frag_color = ir;
   } else if (strcmp(ir->name, "gl_FragData") == 0) {
      this->frag_data = ir;
   } else if (strcmp(ir->name, "gl_FragDepth") == 0) {
      this->frag_depth = ir;
   }

   if (ir->mode == ir_var_in) {
      if (!strcmp(ir->name, "gl_FragCoord")) {
         reg = emit_fragcoord_interpolation(ir);
      } else if (!strcmp(ir->name, "gl_FrontFacing")) {
         reg = emit_frontfacing_interpolation(ir);
      } else {
         reg = emit_general_interpolation(ir);
      }
      assert(reg);
      hash_table_insert(this->variable_ht, reg, ir);
      return;
   }

   if (ir->mode == ir_var_uniform) {
      int param_index = c->prog_data.nr_params;

      if (!strncmp(ir->name, "gl_", 3)) {
         setup_builtin_uniform_values(ir);
      } else {
         setup_uniform_values(ir->location, ir->type);
      }

      reg = new(this->mem_ctx) fs_reg(UNIFORM, param_index);
   }

   if (!reg)
      reg = new(this->mem_ctx) fs_reg(this, ir->type);

   hash_table_insert(this->variable_ht, reg, ir);
}

void
fs_visitor::visit(ir_dereference_variable *ir)
{
   fs_reg *reg = variable_storage(ir->var);
   this->result = *reg;
}

void
fs_visitor::visit(ir_dereference_record *ir)
{
   const glsl_type *struct_type = ir->record->type;

   ir->record->accept(this);

   unsigned int offset = 0;
   for (unsigned int i = 0; i < struct_type->length; i++) {
      if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
         break;
      offset += type_size(struct_type->fields.structure[i].type);
   }
   this->result.reg_offset += offset;
   this->result.type = brw_type_for_base_type(ir->type);
}

void
fs_visitor::visit(ir_dereference_array *ir)
{
   ir_constant *index;
   int element_size;

   ir->array->accept(this);
   index = ir->array_index->as_constant();

   element_size = type_size(ir->type);
   this->result.type = brw_type_for_base_type(ir->type);

   if (index) {
      assert(this->result.file == UNIFORM ||
             (this->result.file == GRF &&
              this->result.reg != 0));
      this->result.reg_offset += index->value.i[0] * element_size;
   } else {
      assert(!"FINISHME: non-constant array element");
   }
}

void
fs_visitor::visit(ir_expression *ir)
{
   unsigned int operand;
   fs_reg op[2], temp;
   fs_reg result;
   fs_inst *inst;

   for (operand = 0; operand < ir->get_num_operands(); operand++) {
      ir->operands[operand]->accept(this);
      if (this->result.file == BAD_FILE) {
         ir_print_visitor v;
         printf("Failed to get tree for expression operand:\n");
         ir->operands[operand]->accept(&v);
         this->fail = true;
      }
      op[operand] = this->result;

      /* Matrix expression operands should have been broken down to vector
       * operations already.
       */
      assert(!ir->operands[operand]->type->is_matrix());
      /* And then those vector operands should have been broken down to scalar.
       */
      assert(!ir->operands[operand]->type->is_vector());
   }

   /* Storage for our result.  If our result goes into an assignment, it will
    * just get copy-propagated out, so no worries.
    */
   this->result = fs_reg(this, ir->type);

   switch (ir->operation) {
   case ir_unop_logic_not:
      emit(fs_inst(BRW_OPCODE_ADD, this->result, op[0], fs_reg(-1)));
      break;
   case ir_unop_neg:
      op[0].negate = !op[0].negate;
      this->result = op[0];
      break;
   case ir_unop_abs:
      op[0].abs = true;
      this->result = op[0];
      break;
   case ir_unop_sign:
      temp = fs_reg(this, ir->type);

      emit(fs_inst(BRW_OPCODE_MOV, this->result, fs_reg(0.0f)));

      inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null, op[0], fs_reg(0.0f)));
      inst->conditional_mod = BRW_CONDITIONAL_G;
      inst = emit(fs_inst(BRW_OPCODE_MOV, this->result, fs_reg(1.0f)));
      inst->predicated = true;

      inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null, op[0], fs_reg(0.0f)));
      inst->conditional_mod = BRW_CONDITIONAL_L;
      inst = emit(fs_inst(BRW_OPCODE_MOV, this->result, fs_reg(-1.0f)));
      inst->predicated = true;

      break;
   case ir_unop_rcp:
      emit_math(FS_OPCODE_RCP, this->result, op[0]);
      break;

   case ir_unop_exp2:
      emit_math(FS_OPCODE_EXP2, this->result, op[0]);
      break;
   case ir_unop_log2:
      emit_math(FS_OPCODE_LOG2, this->result, op[0]);
      break;
   case ir_unop_exp:
   case ir_unop_log:
      assert(!"not reached: should be handled by ir_explog_to_explog2");
      break;
   case ir_unop_sin:
      emit_math(FS_OPCODE_SIN, this->result, op[0]);
      break;
   case ir_unop_cos:
      emit_math(FS_OPCODE_COS, this->result, op[0]);
      break;

   case ir_unop_dFdx:
      emit(fs_inst(FS_OPCODE_DDX, this->result, op[0]));
      break;
   case ir_unop_dFdy:
      emit(fs_inst(FS_OPCODE_DDY, this->result, op[0]));
      break;

   case ir_binop_add:
      emit(fs_inst(BRW_OPCODE_ADD, this->result, op[0], op[1]));
      break;
   case ir_binop_sub:
      assert(!"not reached: should be handled by ir_sub_to_add_neg");
      break;

   case ir_binop_mul:
      emit(fs_inst(BRW_OPCODE_MUL, this->result, op[0], op[1]));
      break;
   case ir_binop_div:
      assert(!"not reached: should be handled by ir_div_to_mul_rcp");
      break;
   case ir_binop_mod:
      assert(!"ir_binop_mod should have been converted to b * fract(a/b)");
      break;

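   /* CMP writes all-ones per passing channel; the AND with 1 turns that
    * into the 0/1 boolean value the rest of the backend expects.
    */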
   case ir_binop_less:
      inst = emit(fs_inst(BRW_OPCODE_CMP, this->result, op[0], op[1]));
      inst->conditional_mod = BRW_CONDITIONAL_L;
      emit(fs_inst(BRW_OPCODE_AND, this->result, this->result, fs_reg(0x1)));
      break;
   case ir_binop_greater:
      inst = emit(fs_inst(BRW_OPCODE_CMP, this->result, op[0], op[1]));
      inst->conditional_mod = BRW_CONDITIONAL_G;
      emit(fs_inst(BRW_OPCODE_AND, this->result, this->result, fs_reg(0x1)));
      break;
   case ir_binop_lequal:
      inst = emit(fs_inst(BRW_OPCODE_CMP, this->result, op[0], op[1]));
      inst->conditional_mod = BRW_CONDITIONAL_LE;
      emit(fs_inst(BRW_OPCODE_AND, this->result, this->result, fs_reg(0x1)));
      break;
   case ir_binop_gequal:
      inst = emit(fs_inst(BRW_OPCODE_CMP, this->result, op[0], op[1]));
      inst->conditional_mod = BRW_CONDITIONAL_GE;
      emit(fs_inst(BRW_OPCODE_AND, this->result, this->result, fs_reg(0x1)));
      break;
   case ir_binop_equal:
   case ir_binop_all_equal: /* same as equal for scalars */
      inst = emit(fs_inst(BRW_OPCODE_CMP, this->result, op[0], op[1]));
      inst->conditional_mod = BRW_CONDITIONAL_Z;
      emit(fs_inst(BRW_OPCODE_AND, this->result, this->result, fs_reg(0x1)));
      break;
   case ir_binop_nequal:
   case ir_binop_any_nequal: /* same as nequal for scalars */
      inst = emit(fs_inst(BRW_OPCODE_CMP, this->result, op[0], op[1]));
      inst->conditional_mod = BRW_CONDITIONAL_NZ;
      emit(fs_inst(BRW_OPCODE_AND, this->result, this->result, fs_reg(0x1)));
      break;

   case ir_binop_logic_xor:
      emit(fs_inst(BRW_OPCODE_XOR, this->result, op[0], op[1]));
      break;

   case ir_binop_logic_or:
      emit(fs_inst(BRW_OPCODE_OR, this->result, op[0], op[1]));
      break;

   case ir_binop_logic_and:
      emit(fs_inst(BRW_OPCODE_AND, this->result, op[0], op[1]));
      break;

   case ir_binop_dot:
   case ir_binop_cross:
   case ir_unop_any:
      assert(!"not reached: should be handled by brw_fs_channel_expressions");
      break;

   case ir_unop_noise:
      assert(!"not reached: should be handled by lower_noise");
      break;

   case ir_unop_sqrt:
      emit_math(FS_OPCODE_SQRT, this->result, op[0]);
      break;

   case ir_unop_rsq:
      emit_math(FS_OPCODE_RSQ, this->result, op[0]);
      break;

   case ir_unop_i2f:
   case ir_unop_b2f:
   case ir_unop_b2i:
      emit(fs_inst(BRW_OPCODE_MOV, this->result, op[0]));
      break;
   case ir_unop_f2i:
      emit(fs_inst(BRW_OPCODE_MOV, this->result, op[0]));
      break;
   case ir_unop_f2b:
   case ir_unop_i2b:
      inst = emit(fs_inst(BRW_OPCODE_CMP, this->result, op[0], fs_reg(0.0f)));
      inst->conditional_mod = BRW_CONDITIONAL_NZ;
      break;

   case ir_unop_trunc:
      emit(fs_inst(BRW_OPCODE_RNDD, this->result, op[0]));
      break;
   case ir_unop_ceil:
      op[0].negate = !op[0].negate;
      inst = emit(fs_inst(BRW_OPCODE_RNDD, this->result, op[0]));
      this->result.negate = true;
      break;
   case ir_unop_floor:
      inst = emit(fs_inst(BRW_OPCODE_RNDD, this->result, op[0]));
      break;
   case ir_unop_fract:
      inst = emit(fs_inst(BRW_OPCODE_FRC, this->result, op[0]));
      break;

   case ir_binop_min:
      inst = emit(fs_inst(BRW_OPCODE_CMP, this->result, op[0], op[1]));
      inst->conditional_mod = BRW_CONDITIONAL_L;

      inst = emit(fs_inst(BRW_OPCODE_SEL, this->result, op[0], op[1]));
      inst->predicated = true;
      break;
   case ir_binop_max:
      inst = emit(fs_inst(BRW_OPCODE_CMP, this->result, op[0], op[1]));
      inst->conditional_mod = BRW_CONDITIONAL_G;

      inst = emit(fs_inst(BRW_OPCODE_SEL, this->result, op[0], op[1]));
      inst->predicated = true;
      break;

   case ir_binop_pow:
      emit_math(FS_OPCODE_POW, this->result, op[0], op[1]);
      break;

   case ir_unop_bit_not:
   case ir_unop_u2f:
   case ir_binop_lshift:
   case ir_binop_rshift:
   case ir_binop_bit_and:
   case ir_binop_bit_xor:
   case ir_binop_bit_or:
      assert(!"GLSL 1.30 features unsupported");
      break;
   }
}

void
fs_visitor::emit_assignment_writes(fs_reg &l, fs_reg &r,
                                   const glsl_type *type, bool predicated)
{
   switch (type->base_type) {
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_BOOL:
      for (unsigned int i = 0; i < type->components(); i++) {
         l.type = brw_type_for_base_type(type);
         r.type = brw_type_for_base_type(type);

         fs_inst *inst = emit(fs_inst(BRW_OPCODE_MOV, l, r));
         inst->predicated = predicated;

         l.reg_offset++;
         r.reg_offset++;
      }
      break;
   case GLSL_TYPE_ARRAY:
      for (unsigned int i = 0; i < type->length; i++) {
         emit_assignment_writes(l, r, type->fields.array, predicated);
      }
      break;

   case GLSL_TYPE_STRUCT:
      for (unsigned int i = 0; i < type->length; i++) {
         emit_assignment_writes(l, r, type->fields.structure[i].type,
                                predicated);
      }
      break;

   case GLSL_TYPE_SAMPLER:
      break;

   default:
      assert(!"not reached");
      break;
   }
}

void
fs_visitor::visit(ir_assignment *ir)
{
   struct fs_reg l, r;
   fs_inst *inst;

   /* FINISHME: arrays on the lhs */
   ir->lhs->accept(this);
   l = this->result;

   ir->rhs->accept(this);
   r = this->result;

   assert(l.file != BAD_FILE);
   assert(r.file != BAD_FILE);

   if (ir->condition) {
      /* Get the condition bool into the predicate. */
      ir->condition->accept(this);
      inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null, this->result, fs_reg(0)));
      inst->conditional_mod = BRW_CONDITIONAL_NZ;
   }

   if (ir->lhs->type->is_scalar() ||
       ir->lhs->type->is_vector()) {
      for (int i = 0; i < ir->lhs->type->vector_elements; i++) {
         if (ir->write_mask & (1 << i)) {
            inst = emit(fs_inst(BRW_OPCODE_MOV, l, r));
            if (ir->condition)
               inst->predicated = true;
            r.reg_offset++;
         }
         l.reg_offset++;
      }
   } else {
      emit_assignment_writes(l, r, ir->lhs->type, ir->condition != NULL);
   }
}

fs_inst *
fs_visitor::emit_texture_gen4(ir_texture *ir, fs_reg dst, fs_reg coordinate)
{
   int mlen;
   int base_mrf = 1;
   bool simd16 = false;
   fs_reg orig_dst;

   /* g0 header. */
   mlen = 1;

   if (ir->shadow_comparitor) {
      for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
         emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i),
                      coordinate));
         coordinate.reg_offset++;
      }
      /* gen4's SIMD8 sampler always has the slots for u,v,r present. */
      mlen += 3;

      if (ir->op == ir_tex) {
         /* There's no plain shadow compare message, so we use shadow
          * compare with a bias of 0.0.
          */
         emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen),
                      fs_reg(0.0f)));
         mlen++;
      } else if (ir->op == ir_txb) {
         ir->lod_info.bias->accept(this);
         emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen),
                      this->result));
         mlen++;
      } else {
         assert(ir->op == ir_txl);
         ir->lod_info.lod->accept(this);
         emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen),
                      this->result));
         mlen++;
      }

      ir->shadow_comparitor->accept(this);
      emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result));
      mlen++;
   } else if (ir->op == ir_tex) {
      for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
         emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i),
                      coordinate));
         coordinate.reg_offset++;
      }
      /* gen4's SIMD8 sampler always has the slots for u,v,r present. */
      mlen += 3;
   } else {
      /* Oh joy.  gen4 doesn't have SIMD8 non-shadow-compare bias/lod
       * instructions.  We'll need to do SIMD16 here.
       */
      assert(ir->op == ir_txb || ir->op == ir_txl);

      /* Each coordinate component takes two registers in a SIMD16 message. */
      for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
         emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i * 2),
                      coordinate));
         coordinate.reg_offset++;
      }

      /* lod/bias appears after u/v/r. */
      mlen += 6;

      if (ir->op == ir_txb) {
         ir->lod_info.bias->accept(this);
         emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen),
                      this->result));
         mlen++;
      } else {
         ir->lod_info.lod->accept(this);
         emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen),
                      this->result));
         mlen++;
      }

      /* The unused upper half. */
      mlen++;

      /* Now, since we're doing simd16, the return is 2 interleaved
       * vec4s where the odd-indexed ones are junk. We'll need to move
       * this weirdness around to the expected layout.
       */
      simd16 = true;
      orig_dst = dst;
      dst = fs_reg(this, glsl_type::get_array_instance(glsl_type::vec4_type,
                                                       2));
      dst.type = BRW_REGISTER_TYPE_F;
   }

   fs_inst *inst = NULL;
   switch (ir->op) {
   case ir_tex:
      inst = emit(fs_inst(FS_OPCODE_TEX, dst));
      break;
   case ir_txb:
      inst = emit(fs_inst(FS_OPCODE_TXB, dst));
      break;
   case ir_txl:
      inst = emit(fs_inst(FS_OPCODE_TXL, dst));
      break;
   case ir_txd:
   case ir_txf:
      assert(!"GLSL 1.30 features unsupported");
      break;
   }
   inst->base_mrf = base_mrf;
   inst->mlen = mlen;

   if (simd16) {
      for (int i = 0; i < 4; i++) {
         emit(fs_inst(BRW_OPCODE_MOV, orig_dst, dst));
         orig_dst.reg_offset++;
         dst.reg_offset += 2;
      }
   }

   return inst;
}

fs_inst *
fs_visitor::emit_texture_gen5(ir_texture *ir, fs_reg dst, fs_reg coordinate)
{
   /* gen5's SIMD8 sampler has slots for u, v, r, array index, then
    * optional parameters like the shadow comparator or LOD bias.  If
    * the optional parameters aren't present, those trailing base slots
    * don't need to be included in the message.
    *
    * Even when a later parameter forces the base slots to be counted,
    * we don't write the unused ones, which may look surprising in the
    * disassembly.
    */
   int mlen = 1; /* g0 header always present. */
   int base_mrf = 1;

   for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
      emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i),
                   coordinate));
      coordinate.reg_offset++;
   }
   mlen += ir->coordinate->type->vector_elements;

   if (ir->shadow_comparitor) {
      mlen = MAX2(mlen, 5);

      ir->shadow_comparitor->accept(this);
      emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result));
      mlen++;
   }

   fs_inst *inst = NULL;
   switch (ir->op) {
   case ir_tex:
      inst = emit(fs_inst(FS_OPCODE_TEX, dst));
      break;
   case ir_txb:
      ir->lod_info.bias->accept(this);
      mlen = MAX2(mlen, 5);
      emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result));
      mlen++;

      inst = emit(fs_inst(FS_OPCODE_TXB, dst));
      break;
   case ir_txl:
      ir->lod_info.lod->accept(this);
      mlen = MAX2(mlen, 5);
      emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result));
      mlen++;

      inst = emit(fs_inst(FS_OPCODE_TXL, dst));
      break;
   case ir_txd:
   case ir_txf:
      assert(!"GLSL 1.30 features unsupported");
      break;
   }
   inst->base_mrf = base_mrf;
   inst->mlen = mlen;

   return inst;
}

void
fs_visitor::visit(ir_texture *ir)
{
   fs_inst *inst = NULL;

   ir->coordinate->accept(this);
   fs_reg coordinate = this->result;

   /* Should be lowered by do_lower_texture_projection */
   assert(!ir->projector);

   /* Writemasking doesn't eliminate channels on SIMD8 texture
    * samples, so don't worry about them.
    */
   fs_reg dst = fs_reg(this, glsl_type::vec4_type);

   if (intel->gen < 5) {
      inst = emit_texture_gen4(ir, dst, coordinate);
   } else {
      inst = emit_texture_gen5(ir, dst, coordinate);
   }

   inst->sampler =
      _mesa_get_sampler_uniform_value(ir->sampler,
                                      ctx->Shader.CurrentProgram,
                                      &brw->fragment_program->Base);
   inst->sampler = c->fp->program.Base.SamplerUnits[inst->sampler];

   this->result = dst;

   if (ir->shadow_comparitor)
      inst->shadow_compare = true;

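   /* Apply the sampler swizzle from the compile key (e.g. depth texture
    * modes or texture swizzling) by shuffling channels into a new vec4.
    */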
   if (c->key.tex_swizzles[inst->sampler] != SWIZZLE_NOOP) {
      fs_reg swizzle_dst = fs_reg(this, glsl_type::vec4_type);

      for (int i = 0; i < 4; i++) {
         int swiz = GET_SWZ(c->key.tex_swizzles[inst->sampler], i);
         fs_reg l = swizzle_dst;
         l.reg_offset += i;

         if (swiz == SWIZZLE_ZERO) {
            emit(fs_inst(BRW_OPCODE_MOV, l, fs_reg(0.0f)));
         } else if (swiz == SWIZZLE_ONE) {
            emit(fs_inst(BRW_OPCODE_MOV, l, fs_reg(1.0f)));
         } else {
            fs_reg r = dst;
            r.reg_offset += GET_SWZ(c->key.tex_swizzles[inst->sampler], i);
            emit(fs_inst(BRW_OPCODE_MOV, l, r));
         }
      }
      this->result = swizzle_dst;
   }
}

void
fs_visitor::visit(ir_swizzle *ir)
{
   ir->val->accept(this);
   fs_reg val = this->result;

   if (ir->type->vector_elements == 1) {
      this->result.reg_offset += ir->mask.x;
      return;
   }

   fs_reg result = fs_reg(this, ir->type);
   this->result = result;

   for (unsigned int i = 0; i < ir->type->vector_elements; i++) {
      fs_reg channel = val;
      int swiz = 0;

      switch (i) {
      case 0:
         swiz = ir->mask.x;
         break;
      case 1:
         swiz = ir->mask.y;
         break;
      case 2:
         swiz = ir->mask.z;
         break;
      case 3:
         swiz = ir->mask.w;
         break;
      }

      channel.reg_offset += swiz;
      emit(fs_inst(BRW_OPCODE_MOV, result, channel));
      result.reg_offset++;
   }
}

void
fs_visitor::visit(ir_discard *ir)
{
   fs_reg temp = fs_reg(this, glsl_type::uint_type);

   assert(ir->condition == NULL); /* FINISHME */

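   /* Invert the current kill mask and AND it into the pixel enables in
    * the g0 header, so discarded pixels are dropped from later writes.
    */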
   emit(fs_inst(FS_OPCODE_DISCARD_NOT, temp, reg_null));
   emit(fs_inst(FS_OPCODE_DISCARD_AND, reg_null, temp));
   kill_emitted = true;
}

void
fs_visitor::visit(ir_constant *ir)
{
   fs_reg reg(this, ir->type);
   this->result = reg;

   for (unsigned int i = 0; i < ir->type->vector_elements; i++) {
      switch (ir->type->base_type) {
      case GLSL_TYPE_FLOAT:
         emit(fs_inst(BRW_OPCODE_MOV, reg, fs_reg(ir->value.f[i])));
         break;
      case GLSL_TYPE_UINT:
         emit(fs_inst(BRW_OPCODE_MOV, reg, fs_reg(ir->value.u[i])));
         break;
      case GLSL_TYPE_INT:
         emit(fs_inst(BRW_OPCODE_MOV, reg, fs_reg(ir->value.i[i])));
         break;
      case GLSL_TYPE_BOOL:
         emit(fs_inst(BRW_OPCODE_MOV, reg, fs_reg((int)ir->value.b[i])));
         break;
      default:
         assert(!"Non-float/uint/int/bool constant");
      }
      reg.reg_offset++;
   }
}

void
fs_visitor::visit(ir_if *ir)
{
   fs_inst *inst;

   /* Don't point the annotation at the if statement, because then it plus
    * the then and else blocks get printed.
    */
   this->base_ir = ir->condition;

   /* Generate the condition into the condition code. */
   ir->condition->accept(this);
   inst = emit(fs_inst(BRW_OPCODE_MOV, fs_reg(brw_null_reg()), this->result));
   inst->conditional_mod = BRW_CONDITIONAL_NZ;

   inst = emit(fs_inst(BRW_OPCODE_IF));
   inst->predicated = true;

   foreach_iter(exec_list_iterator, iter, ir->then_instructions) {
      ir_instruction *ir = (ir_instruction *)iter.get();
      this->base_ir = ir;

      ir->accept(this);
   }

   if (!ir->else_instructions.is_empty()) {
      emit(fs_inst(BRW_OPCODE_ELSE));

      foreach_iter(exec_list_iterator, iter, ir->else_instructions) {
         ir_instruction *ir = (ir_instruction *)iter.get();
         this->base_ir = ir;

         ir->accept(this);
      }
   }

   emit(fs_inst(BRW_OPCODE_ENDIF));
}

void
fs_visitor::visit(ir_loop *ir)
{
   fs_reg counter = reg_undef;

   if (ir->counter) {
      this->base_ir = ir->counter;
      ir->counter->accept(this);
      counter = *(variable_storage(ir->counter));

      if (ir->from) {
         this->base_ir = ir->from;
         ir->from->accept(this);

         emit(fs_inst(BRW_OPCODE_MOV, counter, this->result));
      }
   }

   emit(fs_inst(BRW_OPCODE_DO));

   if (ir->to) {
      this->base_ir = ir->to;
      ir->to->accept(this);

      fs_inst *inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null,
                                   counter, this->result));
      switch (ir->cmp) {
      case ir_binop_equal:
         inst->conditional_mod = BRW_CONDITIONAL_Z;
         break;
      case ir_binop_nequal:
         inst->conditional_mod = BRW_CONDITIONAL_NZ;
         break;
      case ir_binop_gequal:
         inst->conditional_mod = BRW_CONDITIONAL_GE;
         break;
      case ir_binop_lequal:
         inst->conditional_mod = BRW_CONDITIONAL_LE;
         break;
      case ir_binop_greater:
         inst->conditional_mod = BRW_CONDITIONAL_G;
         break;
      case ir_binop_less:
         inst->conditional_mod = BRW_CONDITIONAL_L;
         break;
      default:
         assert(!"not reached: unknown loop condition");
         this->fail = true;
         break;
      }

      inst = emit(fs_inst(BRW_OPCODE_BREAK));
      inst->predicated = true;
   }

   foreach_iter(exec_list_iterator, iter, ir->body_instructions) {
      ir_instruction *ir = (ir_instruction *)iter.get();

      this->base_ir = ir;
      ir->accept(this);
   }

   if (ir->increment) {
      this->base_ir = ir->increment;
      ir->increment->accept(this);
      emit(fs_inst(BRW_OPCODE_ADD, counter, counter, this->result));
   }

   emit(fs_inst(BRW_OPCODE_WHILE));
}

void
fs_visitor::visit(ir_loop_jump *ir)
{
   switch (ir->mode) {
   case ir_loop_jump::jump_break:
      emit(fs_inst(BRW_OPCODE_BREAK));
      break;
   case ir_loop_jump::jump_continue:
      emit(fs_inst(BRW_OPCODE_CONTINUE));
      break;
   }
}

void
fs_visitor::visit(ir_call *ir)
{
   assert(!"FINISHME");
}

void
fs_visitor::visit(ir_return *ir)
{
   assert(!"FINISHME");
}

void
fs_visitor::visit(ir_function *ir)
{
   /* Ignore function bodies other than main() -- we shouldn't see calls to
    * them since they should all be inlined before we get to ir_to_mesa.
    */
   if (strcmp(ir->name, "main") == 0) {
      const ir_function_signature *sig;
      exec_list empty;

      sig = ir->matching_signature(&empty);

      assert(sig);

      foreach_iter(exec_list_iterator, iter, sig->body) {
         ir_instruction *ir = (ir_instruction *)iter.get();
         this->base_ir = ir;

         ir->accept(this);
      }
   }
}

void
fs_visitor::visit(ir_function_signature *ir)
{
   assert(!"not reached");
   (void)ir;
}

fs_inst *
fs_visitor::emit(fs_inst inst)
{
   fs_inst *list_inst = new(mem_ctx) fs_inst;
   *list_inst = inst;

   list_inst->annotation = this->current_annotation;
   list_inst->ir = this->base_ir;

   this->instructions.push_tail(list_inst);

   return list_inst;
}

/** Emits a dummy fragment shader consisting of magenta for bringup purposes. */
void
fs_visitor::emit_dummy_fs()
{
   /* Everyone's favorite color. */
   emit(fs_inst(BRW_OPCODE_MOV,
                fs_reg(MRF, 2),
                fs_reg(1.0f)));
   emit(fs_inst(BRW_OPCODE_MOV,
                fs_reg(MRF, 3),
                fs_reg(0.0f)));
   emit(fs_inst(BRW_OPCODE_MOV,
                fs_reg(MRF, 4),
                fs_reg(1.0f)));
   emit(fs_inst(BRW_OPCODE_MOV,
                fs_reg(MRF, 5),
                fs_reg(0.0f)));

   fs_inst *write;
   write = emit(fs_inst(FS_OPCODE_FB_WRITE,
                        fs_reg(0),
                        fs_reg(0)));
   write->base_mrf = 0;
}

/* The register location here is relative to the start of the URB
 * data.  It will get adjusted to be a real location before
 * generate_code() time.
 */
struct brw_reg
fs_visitor::interp_reg(int location, int channel)
{
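   /* Each attribute's setup data occupies two GRFs, with two channels
    * per register and four floats of plane coefficients per channel.
    */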
   int regnr = urb_setup[location] * 2 + channel / 2;
   int stride = (channel & 1) * 4;

   assert(urb_setup[location] != -1);

   return brw_vec1_grf(regnr, stride);
}

/** Emits the interpolation for the varying inputs. */
void
fs_visitor::emit_interpolation_setup_gen4()
{
   struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);

   this->current_annotation = "compute pixel centers";
   this->pixel_x = fs_reg(this, glsl_type::uint_type);
   this->pixel_y = fs_reg(this, glsl_type::uint_type);
   this->pixel_x.type = BRW_REGISTER_TYPE_UW;
   this->pixel_y.type = BRW_REGISTER_TYPE_UW;
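   /* g1 holds the X/Y origin of each 2x2 subspan.  The brw_imm_v
    * immediates pack per-pixel 4-bit offsets (0,1,0,1,... in X and
    * 0,0,1,1,... in Y) that turn it into per-pixel coordinates.
    */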
   emit(fs_inst(BRW_OPCODE_ADD,
                this->pixel_x,
                fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
                fs_reg(brw_imm_v(0x10101010))));
   emit(fs_inst(BRW_OPCODE_ADD,
                this->pixel_y,
                fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
                fs_reg(brw_imm_v(0x11001100))));

   this->current_annotation = "compute pixel deltas from v0";
   if (brw->has_pln) {
      this->delta_x = fs_reg(this, glsl_type::vec2_type);
      this->delta_y = this->delta_x;
      this->delta_y.reg_offset++;
   } else {
      this->delta_x = fs_reg(this, glsl_type::float_type);
      this->delta_y = fs_reg(this, glsl_type::float_type);
   }
   emit(fs_inst(BRW_OPCODE_ADD,
                this->delta_x,
                this->pixel_x,
                fs_reg(negate(brw_vec1_grf(1, 0)))));
   emit(fs_inst(BRW_OPCODE_ADD,
                this->delta_y,
                this->pixel_y,
                fs_reg(negate(brw_vec1_grf(1, 1)))));

   this->current_annotation = "compute pos.w and 1/pos.w";
   /* Compute wpos.w.  It's always in our setup, since it's needed to
    * interpolate the other attributes.
    */
   this->wpos_w = fs_reg(this, glsl_type::float_type);
   emit(fs_inst(FS_OPCODE_LINTERP, wpos_w, this->delta_x, this->delta_y,
                interp_reg(FRAG_ATTRIB_WPOS, 3)));
   /* Compute the pixel 1/W value from wpos.w. */
   this->pixel_w = fs_reg(this, glsl_type::float_type);
   emit_math(FS_OPCODE_RCP, this->pixel_w, wpos_w);
   this->current_annotation = NULL;
}

/** Emits the interpolation for the varying inputs. */
void
fs_visitor::emit_interpolation_setup_gen6()
{
   struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);

   /* If the pixel centers end up used, the setup is the same as for gen4. */
   this->current_annotation = "compute pixel centers";
   fs_reg int_pixel_x = fs_reg(this, glsl_type::uint_type);
   fs_reg int_pixel_y = fs_reg(this, glsl_type::uint_type);
   int_pixel_x.type = BRW_REGISTER_TYPE_UW;
   int_pixel_y.type = BRW_REGISTER_TYPE_UW;
   emit(fs_inst(BRW_OPCODE_ADD,
                int_pixel_x,
                fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
                fs_reg(brw_imm_v(0x10101010))));
   emit(fs_inst(BRW_OPCODE_ADD,
                int_pixel_y,
                fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
                fs_reg(brw_imm_v(0x11001100))));

   /* As of gen6, we can no longer mix float and int sources.  We have
    * to turn the integer pixel centers into floats for their actual
    * use.
    */
   this->pixel_x = fs_reg(this, glsl_type::float_type);
   this->pixel_y = fs_reg(this, glsl_type::float_type);
   emit(fs_inst(BRW_OPCODE_MOV, this->pixel_x, int_pixel_x));
   emit(fs_inst(BRW_OPCODE_MOV, this->pixel_y, int_pixel_y));

   this->current_annotation = "compute 1/pos.w";
   this->wpos_w = fs_reg(brw_vec8_grf(c->key.source_w_reg, 0));
   this->pixel_w = fs_reg(this, glsl_type::float_type);
   emit_math(FS_OPCODE_RCP, this->pixel_w, wpos_w);

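   /* On gen6 the barycentric pixel deltas are delivered in the thread
    * payload (g2/g3) rather than computed here.
    */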
   this->delta_x = fs_reg(brw_vec8_grf(2, 0));
   this->delta_y = fs_reg(brw_vec8_grf(3, 0));

   this->current_annotation = NULL;
}

void
fs_visitor::emit_fb_writes()
{
   this->current_annotation = "FB write header";
   GLboolean header_present = GL_TRUE;
   int nr = 0;

   if (intel->gen >= 6 &&
       !this->kill_emitted &&
       c->key.nr_color_regions == 1) {
      header_present = false;
   }

   if (header_present) {
      /* m0, m1 header */
      nr += 2;
   }

   if (c->key.aa_dest_stencil_reg) {
      emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
                   fs_reg(brw_vec8_grf(c->key.aa_dest_stencil_reg, 0))));
   }

   /* Reserve space for color. It'll be filled in per MRT below. */
   int color_mrf = nr;
   nr += 4;

   if (c->key.source_depth_to_render_target) {
      if (c->key.computes_depth) {
         /* Hand over gl_FragDepth. */
         assert(this->frag_depth);
         fs_reg depth = *(variable_storage(this->frag_depth));

         emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, nr++), depth));
      } else {
         /* Pass through the payload depth. */
         emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
                      fs_reg(brw_vec8_grf(c->key.source_depth_reg, 0))));
      }
   }

   if (c->key.dest_depth_reg) {
      emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
                   fs_reg(brw_vec8_grf(c->key.dest_depth_reg, 0))));
   }

   fs_reg color = reg_undef;
   if (this->frag_color)
      color = *(variable_storage(this->frag_color));
   else if (this->frag_data)
      color = *(variable_storage(this->frag_data));

   for (int target = 0; target < c->key.nr_color_regions; target++) {
      this->current_annotation = talloc_asprintf(this->mem_ctx,
                                                 "FB write target %d",
                                                 target);
      if (this->frag_color || this->frag_data) {
         for (int i = 0; i < 4; i++) {
            emit(fs_inst(BRW_OPCODE_MOV,
                         fs_reg(MRF, color_mrf + i),
                         color));
            color.reg_offset++;
         }
      }

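      /* gl_FragColor broadcasts one value to every render target, so
       * rewind to the same four components for the next iteration;
       * gl_FragData advances to the next array element.
       */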
      if (this->frag_color)
         color.reg_offset -= 4;

      fs_inst *inst = emit(fs_inst(FS_OPCODE_FB_WRITE,
                                   reg_undef, reg_undef));
      inst->target = target;
      inst->base_mrf = 0;
      inst->mlen = nr;
      if (target == c->key.nr_color_regions - 1)
         inst->eot = true;
      inst->header_present = header_present;
   }

   if (c->key.nr_color_regions == 0) {
      fs_inst *inst = emit(fs_inst(FS_OPCODE_FB_WRITE,
                                   reg_undef, reg_undef));
      inst->base_mrf = 0;
      inst->mlen = nr;
      inst->eot = true;
      inst->header_present = header_present;
   }

   this->current_annotation = NULL;
}

void
fs_visitor::generate_fb_write(fs_inst *inst)
{
   GLboolean eot = inst->eot;
   struct brw_reg implied_header;

   /* The header is 2 regs, with g0 and g1 as the contents.  Pre-gen6,
    * g0 arrives via the send's implied header move, so only g1 needs an
    * explicit copy; gen6 copies g0 as well.
    */
   brw_push_insn_state(p);
   brw_set_mask_control(p, BRW_MASK_DISABLE);
   brw_set_compression_control(p, BRW_COMPRESSION_NONE);

   if (inst->header_present) {
      if (intel->gen >= 6) {
         brw_MOV(p,
                 brw_message_reg(inst->base_mrf),
                 brw_vec8_grf(0, 0));
         implied_header = brw_null_reg();
      } else {
         implied_header = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
      }

      brw_MOV(p,
              brw_message_reg(inst->base_mrf + 1),
              brw_vec8_grf(1, 0));
   } else {
      implied_header = brw_null_reg();
   }

   brw_pop_insn_state(p);

   brw_fb_WRITE(p,
                8, /* dispatch_width */
                retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW),
                inst->base_mrf,
                implied_header,
                inst->target,
                inst->mlen,
                0,
                eot);
}

void
fs_visitor::generate_linterp(fs_inst *inst,
                             struct brw_reg dst, struct brw_reg *src)
{
   struct brw_reg delta_x = src[0];
   struct brw_reg delta_y = src[1];
   struct brw_reg interp = src[2];

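   /* PLN reads delta_x/delta_y as one aligned, adjacent register pair;
    * when they aren't laid out that way, fall back to LINE + MAC.
    */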
   if (brw->has_pln &&
       delta_y.nr == delta_x.nr + 1 &&
       (intel->gen >= 6 || (delta_x.nr & 1) == 0)) {
      brw_PLN(p, dst, interp, delta_x);
   } else {
      brw_LINE(p, brw_null_reg(), interp, delta_x);
      brw_MAC(p, dst, suboffset(interp, 1), delta_y);
   }
}

void
fs_visitor::generate_math(fs_inst *inst,
                          struct brw_reg dst, struct brw_reg *src)
{
   int op;

   switch (inst->opcode) {
   case FS_OPCODE_RCP:
      op = BRW_MATH_FUNCTION_INV;
      break;
   case FS_OPCODE_RSQ:
      op = BRW_MATH_FUNCTION_RSQ;
      break;
   case FS_OPCODE_SQRT:
      op = BRW_MATH_FUNCTION_SQRT;
      break;
   case FS_OPCODE_EXP2:
      op = BRW_MATH_FUNCTION_EXP;
      break;
   case FS_OPCODE_LOG2:
      op = BRW_MATH_FUNCTION_LOG;
      break;
   case FS_OPCODE_POW:
      op = BRW_MATH_FUNCTION_POW;
      break;
   case FS_OPCODE_SIN:
      op = BRW_MATH_FUNCTION_SIN;
      break;
   case FS_OPCODE_COS:
      op = BRW_MATH_FUNCTION_COS;
      break;
   default:
      assert(!"not reached: unknown math function");
      op = 0;
      break;
   }

   if (intel->gen >= 6) {
      assert(inst->mlen == 0);

      if (inst->opcode == FS_OPCODE_POW) {
         brw_math2(p, dst, op, src[0], src[1]);
      } else {
         brw_math(p, dst,
                  op,
                  inst->saturate ? BRW_MATH_SATURATE_SATURATE :
                  BRW_MATH_SATURATE_NONE,
                  0, src[0],
                  BRW_MATH_DATA_VECTOR,
                  BRW_MATH_PRECISION_FULL);
      }
   } else {
      assert(inst->mlen >= 1);

      brw_math(p, dst,
               op,
               inst->saturate ? BRW_MATH_SATURATE_SATURATE :
               BRW_MATH_SATURATE_NONE,
               inst->base_mrf, src[0],
               BRW_MATH_DATA_VECTOR,
               BRW_MATH_PRECISION_FULL);
   }
}

void
fs_visitor::generate_tex(fs_inst *inst, struct brw_reg dst)
{
   int msg_type = -1;
   int rlen = 4;
   uint32_t simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;

   if (intel->gen >= 5) {
      switch (inst->opcode) {
      case FS_OPCODE_TEX:
         if (inst->shadow_compare) {
            msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_COMPARE_GEN5;
         } else {
            msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_GEN5;
         }
         break;
      case FS_OPCODE_TXB:
         if (inst->shadow_compare) {
            msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE_GEN5;
         } else {
            msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_BIAS_GEN5;
         }
         break;
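      case FS_OPCODE_TXL:
         /* emit_texture_gen5() emits TXL as well; this case assumes the
          * gen5 LOD sampler message defines from brw_defines.h, matching
          * the BIAS/COMPARE variants used above.
          */
         if (inst->shadow_compare) {
            msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE_GEN5;
         } else {
            msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_LOD_GEN5;
         }
         break;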
1853      }
1854   } else {
1855      switch (inst->opcode) {
1856      case FS_OPCODE_TEX:
1857	 /* Note that G45 and older determines shadow compare and dispatch width
1858	  * from message length for most messages.
1859	  */
1860	 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
1861	 if (inst->shadow_compare) {
1862	    assert(inst->mlen == 5);
1863	 } else {
1864	    assert(inst->mlen <= 6);
1865	 }
1866	 break;
1867      case FS_OPCODE_TXB:
1868	 if (inst->shadow_compare) {
1869	    assert(inst->mlen == 5);
1870	    msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
1871	 } else {
1872	    assert(inst->mlen == 8);
1873	    msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
1874	    simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
1875	 }
1876	 break;
1877      }
1878   }
1879   assert(msg_type != -1);
1880
1881   if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) {
1882      rlen = 8;
1883      dst = vec16(dst);
1884   }
1885
1886   brw_SAMPLE(p,
1887	      retype(dst, BRW_REGISTER_TYPE_UW),
1888	      inst->base_mrf,
1889	      retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
1890	      SURF_INDEX_TEXTURE(inst->sampler),
1891	      inst->sampler,
1892	      WRITEMASK_XYZW,
1893	      msg_type,
1894	      rlen,
1895	      inst->mlen,
1896	      0,
1897	      1,
1898	      simd_mode);
1899}
1900
1901
1902/* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
1903 * looking like:
1904 *
1905 * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
1906 *
1907 * and we're trying to produce:
1908 *
1909 *           DDX                     DDY
1910 * dst: (ss0.tr - ss0.tl)     (ss0.tl - ss0.bl)
1911 *      (ss0.tr - ss0.tl)     (ss0.tr - ss0.br)
1912 *      (ss0.br - ss0.bl)     (ss0.tl - ss0.bl)
1913 *      (ss0.br - ss0.bl)     (ss0.tr - ss0.br)
1914 *      (ss1.tr - ss1.tl)     (ss1.tl - ss1.bl)
1915 *      (ss1.tr - ss1.tl)     (ss1.tr - ss1.br)
1916 *      (ss1.br - ss1.bl)     (ss1.tl - ss1.bl)
1917 *      (ss1.br - ss1.bl)     (ss1.tr - ss1.br)
1918 *
1919 * and add another set of two more subspans if in 16-pixel dispatch mode.
1920 *
1921 * For DDX it ends up being easy: width = 2 with horizontal stride 0 gets us
1922 * the same result for each pair, and vertical stride = 2 jumps us 2 elements
1923 * after processing a pair.  DDY is harder, as we want the pairs swizzled
1924 * between each other.  We could probably do it like DDX and swizzle into the
1925 * right order later, but bail for now and just produce
1926 * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
1927 */
1928void
1929fs_visitor::generate_ddx(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
1930{
1931   struct brw_reg src0 = brw_reg(src.file, src.nr, 1,
1932				 BRW_REGISTER_TYPE_F,
1933				 BRW_VERTICAL_STRIDE_2,
1934				 BRW_WIDTH_2,
1935				 BRW_HORIZONTAL_STRIDE_0,
1936				 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
1937   struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
1938				 BRW_REGISTER_TYPE_F,
1939				 BRW_VERTICAL_STRIDE_2,
1940				 BRW_WIDTH_2,
1941				 BRW_HORIZONTAL_STRIDE_0,
1942				 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
1943   brw_ADD(p, dst, src0, negate(src1));
1944}
1945
1946void
1947fs_visitor::generate_ddy(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
1948{
1949   struct brw_reg src0 = brw_reg(src.file, src.nr, 0,
1950				 BRW_REGISTER_TYPE_F,
1951				 BRW_VERTICAL_STRIDE_4,
1952				 BRW_WIDTH_4,
1953				 BRW_HORIZONTAL_STRIDE_0,
1954				 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
1955   struct brw_reg src1 = brw_reg(src.file, src.nr, 2,
1956				 BRW_REGISTER_TYPE_F,
1957				 BRW_VERTICAL_STRIDE_4,
1958				 BRW_WIDTH_4,
1959				 BRW_HORIZONTAL_STRIDE_0,
1960				 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
1961   brw_ADD(p, dst, src0, negate(src1));
1962}
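
/* Illustrative sketch (hypothetical, not part of this file): emulate the
 * <2;2,0> region generate_ddx() builds, on a plain 8-channel array, to
 * show which subspan values end up subtracted.  emulate_ddx_region() is
 * an invented name for illustration only.
 */
#if 0
static void
emulate_ddx_region(const float src[8], float dst[8])
{
   for (int i = 0; i < 8; i++) {
      /* width 2 with horizontal stride 0: both channels of a pair read
       * one element; vertical stride 2: each pair starts 2 elements on.
       */
      int pair = i / 2;
      float left = src[pair * 2 + 0];   /* suboffset 0: tl/bl of the subspan */
      float right = src[pair * 2 + 1];  /* suboffset 1: tr/br of the subspan */
      dst[i] = right - left;
   }
}
#endif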
1963
1964void
1965fs_visitor::generate_discard_not(fs_inst *inst, struct brw_reg mask)
1966{
1967   brw_push_insn_state(p);
1968   brw_set_mask_control(p, BRW_MASK_DISABLE);
1969   brw_NOT(p, mask, brw_mask_reg(1)); /* IMASK */
1970   brw_pop_insn_state(p);
1971}
1972
1973void
1974fs_visitor::generate_discard_and(fs_inst *inst, struct brw_reg mask)
1975{
1976   struct brw_reg g0 = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
1977   mask = brw_uw1_reg(mask.file, mask.nr, 0);
1978
1979   brw_push_insn_state(p);
1980   brw_set_mask_control(p, BRW_MASK_DISABLE);
1981   brw_AND(p, g0, mask, g0);
1982   brw_pop_insn_state(p);
1983}
1984
1985void
1986fs_visitor::assign_curb_setup()
1987{
1988   c->prog_data.first_curbe_grf = c->key.nr_payload_regs;
1989   c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
1990
1991   /* Map the offsets in the UNIFORM file to fixed HW regs. */
1992   foreach_iter(exec_list_iterator, iter, this->instructions) {
1993      fs_inst *inst = (fs_inst *)iter.get();
1994
1995      for (unsigned int i = 0; i < 3; i++) {
1996	 if (inst->src[i].file == UNIFORM) {
1997	    int constant_nr = inst->src[i].hw_reg + inst->src[i].reg_offset;
1998	    struct brw_reg brw_reg = brw_vec1_grf(c->prog_data.first_curbe_grf +
1999						  constant_nr / 8,
2000						  constant_nr % 8);
2001
2002	    inst->src[i].file = FIXED_HW_REG;
2003	    inst->src[i].fixed_hw_reg = brw_reg;
2004	 }
2005      }
2006   }
2007}
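
/* Illustrative sketch (hypothetical helper, invented for illustration):
 * the UNIFORM-to-fixed-reg mapping above is plain base-8 arithmetic,
 * since eight floats fit in one GRF.  With first_curbe_grf == 2,
 * uniform 11 lands in g3.3, for example.
 */
#if 0
static void
curb_location(int first_curbe_grf, int constant_nr, int *grf, int *subreg)
{
   *grf = first_curbe_grf + constant_nr / 8;	/* which register */
   *subreg = constant_nr % 8;			/* which float within it */
}
#endif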
2008
2009void
2010fs_visitor::calculate_urb_setup()
2011{
2012   for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
2013      urb_setup[i] = -1;
2014   }
2015
2016   int urb_next = 0;
2017   /* Figure out where each of the incoming setup attributes lands. */
2018   if (intel->gen >= 6) {
2019      for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
2020	 if (brw->fragment_program->Base.InputsRead & BITFIELD64_BIT(i)) {
2021	    urb_setup[i] = urb_next++;
2022	 }
2023      }
2024   } else {
2025      /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
2026      for (unsigned int i = 0; i < VERT_RESULT_MAX; i++) {
2027	 if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
2028	    int fp_index;
2029
2030	    if (i >= VERT_RESULT_VAR0)
2031	       fp_index = i - (VERT_RESULT_VAR0 - FRAG_ATTRIB_VAR0);
2032	    else if (i <= VERT_RESULT_TEX7)
2033	       fp_index = i;
2034	    else
2035	       fp_index = -1;
2036
2037	    if (fp_index >= 0)
2038	       urb_setup[fp_index] = urb_next++;
2039	 }
2040      }
2041   }
2042
2043   /* Each attribute is 4 setup channels, each of which is half a reg. */
2044   c->prog_data.urb_read_length = urb_next * 2;
2045}
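
/* Illustrative sketch (hypothetical helper): the read-length math above
 * in one place -- each live attribute takes 4 setup channels at half a
 * register each, so N attributes cost 2 * N registers of URB read.
 */
#if 0
static int
urb_regs_read(int attrs_live)
{
   return attrs_live * 2;	/* e.g. 3 attributes -> 6 registers */
}
#endif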
2046
2047void
2048fs_visitor::assign_urb_setup()
2049{
2050   int urb_start = c->prog_data.first_curbe_grf + c->prog_data.curb_read_length;
2051
2052    /* Offset all the urb_setup[] indices by the actual position of the
2053    * setup regs, now that the location of the constants has been chosen.
2054    */
2055   foreach_iter(exec_list_iterator, iter, this->instructions) {
2056      fs_inst *inst = (fs_inst *)iter.get();
2057
2058      if (inst->opcode != FS_OPCODE_LINTERP)
2059	 continue;
2060
2061      assert(inst->src[2].file == FIXED_HW_REG);
2062
2063      inst->src[2].fixed_hw_reg.nr += urb_start;
2064   }
2065
2066   this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
2067}
2068
2069static void
2070assign_reg(int *reg_hw_locations, fs_reg *reg)
2071{
2072   if (reg->file == GRF && reg->reg != 0) {
2073      reg->hw_reg = reg_hw_locations[reg->reg] + reg->reg_offset;
2074      reg->reg = 0;
2075   }
2076}
2077
2078void
2079fs_visitor::assign_regs_trivial()
2080{
2081   int last_grf = 0;
2082   int hw_reg_mapping[this->virtual_grf_next];
2083   int i;
2084
2085   hw_reg_mapping[0] = 0;
2086   hw_reg_mapping[1] = this->first_non_payload_grf;
2087   for (i = 2; i < this->virtual_grf_next; i++) {
2088      hw_reg_mapping[i] = (hw_reg_mapping[i - 1] +
2089			   this->virtual_grf_sizes[i - 1]);
2090   }
2091   last_grf = hw_reg_mapping[i - 1] + this->virtual_grf_sizes[i - 1];
2092
2093   foreach_iter(exec_list_iterator, iter, this->instructions) {
2094      fs_inst *inst = (fs_inst *)iter.get();
2095
2096      assign_reg(hw_reg_mapping, &inst->dst);
2097      assign_reg(hw_reg_mapping, &inst->src[0]);
2098      assign_reg(hw_reg_mapping, &inst->src[1]);
2099   }
2100
2101   this->grf_used = last_grf + 1;
2102}
2103
2104void
2105fs_visitor::assign_regs()
2106{
2107   int last_grf = 0;
2108   int hw_reg_mapping[this->virtual_grf_next + 1];
2109   int base_reg_count = BRW_MAX_GRF - this->first_non_payload_grf;
2110   int class_sizes[base_reg_count];
2111   int class_count = 0;
2112   int aligned_pair_class = -1;
2113
2114   /* Set up the register classes.
2115    *
2116    * The base registers store a scalar value.  For texture samples,
2117    * we get virtual GRFs composed of 4 contiguous hw registers.  For
2118    * structures and arrays, we store them as contiguous larger things
2119    * than that, though we should be able to do better most of the
2120    * time.
2121    */
2122   class_sizes[class_count++] = 1;
2123   if (brw->has_pln && intel->gen < 6) {
2124      /* Always set up the (unaligned) pairs for gen5, so we can find
2125       * them for making the aligned pair class.
2126       */
2127      class_sizes[class_count++] = 2;
2128   }
2129   for (int r = 1; r < this->virtual_grf_next; r++) {
2130      int i;
2131
2132      for (i = 0; i < class_count; i++) {
2133	 if (class_sizes[i] == this->virtual_grf_sizes[r])
2134	    break;
2135      }
2136      if (i == class_count) {
2137	 if (this->virtual_grf_sizes[r] >= base_reg_count) {
2138	    fprintf(stderr, "Object too large to register allocate.\n");
2139	    this->fail = true;
2140	 }
2141
2142	 class_sizes[class_count++] = this->virtual_grf_sizes[r];
2143      }
2144   }
2145
2146   int ra_reg_count = 0;
2147   int class_base_reg[class_count];
2148   int class_reg_count[class_count];
2149   int classes[class_count + 1];
2150
2151   for (int i = 0; i < class_count; i++) {
2152      class_base_reg[i] = ra_reg_count;
2153      class_reg_count[i] = base_reg_count - (class_sizes[i] - 1);
2154      ra_reg_count += class_reg_count[i];
2155   }
2156
2157   struct ra_regs *regs = ra_alloc_reg_set(ra_reg_count);
2158   for (int i = 0; i < class_count; i++) {
2159      classes[i] = ra_alloc_reg_class(regs);
2160
2161      for (int i_r = 0; i_r < class_reg_count[i]; i_r++) {
2162	 ra_class_add_reg(regs, classes[i], class_base_reg[i] + i_r);
2163      }
2164
2165      /* Add conflicts between our contiguous registers aliasing
2166       * base regs and other register classes' contiguous registers
2167       * that alias base regs, or the base regs themselves for classes[0].
2168       */
2169      for (int c = 0; c <= i; c++) {
2170	 for (int i_r = 0; i_r < class_reg_count[i]; i_r++) {
2171	    for (int c_r = MAX2(0, i_r - (class_sizes[c] - 1));
2172		 c_r < MIN2(class_reg_count[c], i_r + class_sizes[i]);
2173		 c_r++) {
2174
2175	       if (0) {
2176		  printf("%d/%d conflicts %d/%d\n",
2177			 class_sizes[i], this->first_non_payload_grf + i_r,
2178			 class_sizes[c], this->first_non_payload_grf + c_r);
2179	       }
2180
2181	       ra_add_reg_conflict(regs,
2182				   class_base_reg[i] + i_r,
2183				   class_base_reg[c] + c_r);
2184	    }
2185	 }
2186      }
2187   }
2188
2189   /* Add a special class for aligned pairs, which we'll put delta_x/y
2190    * in on gen5 so that we can do PLN.
2191    */
2192   if (brw->has_pln && intel->gen < 6) {
2193      int reg_count = (base_reg_count - 1) / 2;
2194      int unaligned_pair_class = 1;
2195      assert(class_sizes[unaligned_pair_class] == 2);
2196
2197      aligned_pair_class = class_count;
2198      classes[aligned_pair_class] = ra_alloc_reg_class(regs);
2199      class_base_reg[aligned_pair_class] = 0;
2200      class_reg_count[aligned_pair_class] = 0;
2201      int start = (this->first_non_payload_grf & 1) ? 1 : 0;
2202
2203      for (int i = 0; i < reg_count; i++) {
2204	 ra_class_add_reg(regs, classes[aligned_pair_class],
2205			  class_base_reg[unaligned_pair_class] + i * 2 + start);
2206      }
2207      class_count++;
2208   }
2209
2210   ra_set_finalize(regs);
2211
2212   struct ra_graph *g = ra_alloc_interference_graph(regs,
2213						    this->virtual_grf_next);
2214   /* Node 0 is just a placeholder to keep virtual_grf[] mapping 1:1
2215    * with nodes.
2216    */
2217   ra_set_node_class(g, 0, classes[0]);
2218
2219   for (int i = 1; i < this->virtual_grf_next; i++) {
2220      for (int c = 0; c < class_count; c++) {
2221	 if (class_sizes[c] == this->virtual_grf_sizes[i]) {
2222	    if (aligned_pair_class >= 0 &&
2223		this->delta_x.reg == i) {
2224	       ra_set_node_class(g, i, classes[aligned_pair_class]);
2225	    } else {
2226	       ra_set_node_class(g, i, classes[c]);
2227	    }
2228	    break;
2229	 }
2230      }
2231
2232      for (int j = 1; j < i; j++) {
2233	 if (virtual_grf_interferes(i, j)) {
2234	    ra_add_node_interference(g, i, j);
2235	 }
2236      }
2237   }
2238
2239   /* FINISHME: Handle spilling */
2240   if (!ra_allocate_no_spills(g)) {
2241      fprintf(stderr, "Failed to allocate registers.\n");
2242      this->fail = true;
2243      return;
2244   }
2245
2246   /* Get the chosen virtual registers for each node, and map virtual
2247    * regs in the register classes back down to real hardware reg
2248    * numbers.
2249    */
2250   hw_reg_mapping[0] = 0; /* unused */
2251   for (int i = 1; i < this->virtual_grf_next; i++) {
2252      int reg = ra_get_node_reg(g, i);
2253      int hw_reg = -1;
2254
2255      for (int c = 0; c < class_count; c++) {
2256	 if (reg >= class_base_reg[c] &&
2257	     reg < class_base_reg[c] + class_reg_count[c]) {
2258	    hw_reg = reg - class_base_reg[c];
2259	    break;
2260	 }
2261      }
2262
2263      assert(hw_reg != -1);
2264      hw_reg_mapping[i] = this->first_non_payload_grf + hw_reg;
2265      last_grf = MAX2(last_grf,
2266		      hw_reg_mapping[i] + this->virtual_grf_sizes[i] - 1);
2267   }
2268
2269   foreach_iter(exec_list_iterator, iter, this->instructions) {
2270      fs_inst *inst = (fs_inst *)iter.get();
2271
2272      assign_reg(hw_reg_mapping, &inst->dst);
2273      assign_reg(hw_reg_mapping, &inst->src[0]);
2274      assign_reg(hw_reg_mapping, &inst->src[1]);
2275   }
2276
2277   this->grf_used = last_grf + 1;
2278
2279   talloc_free(g);
2280   talloc_free(regs);
2281}
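
/* Illustrative sketch (hypothetical helper): the MAX2/MIN2 bounds in the
 * class-conflict loop above are an interval-intersection test solved for
 * c_r -- a size_c allocation at base GRF c_r conflicts with a size_i
 * allocation at base GRF i_r exactly when their GRF ranges overlap.
 */
#if 0
static bool
allocs_overlap(int i_r, int size_i, int c_r, int size_c)
{
   /* [i_r, i_r + size_i) intersects [c_r, c_r + size_c), i.e.
    * c_r >= i_r - (size_c - 1) and c_r < i_r + size_i.
    */
   return c_r + size_c > i_r && c_r < i_r + size_i;
}
#endif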
2282
2283void
2284fs_visitor::calculate_live_intervals()
2285{
2286   int num_vars = this->virtual_grf_next;
2287   int *def = talloc_array(mem_ctx, int, num_vars);
2288   int *use = talloc_array(mem_ctx, int, num_vars);
2289   int loop_depth = 0;
2290   int loop_start = 0;
2291
2292   for (int i = 0; i < num_vars; i++) {
2293      def[i] = 1 << 30;
2294      use[i] = -1;
2295   }
2296
2297   int ip = 0;
2298   foreach_iter(exec_list_iterator, iter, this->instructions) {
2299      fs_inst *inst = (fs_inst *)iter.get();
2300
2301      if (inst->opcode == BRW_OPCODE_DO) {
2302	 if (loop_depth++ == 0)
2303	    loop_start = ip;
2304      } else if (inst->opcode == BRW_OPCODE_WHILE) {
2305	 loop_depth--;
2306
2307	 if (loop_depth == 0) {
2308	    /* FINISHME:
2309	     *
2310	     * Patches up any vars marked for use within the loop as
2311	     * live until the end.  This is conservative, as there
2312	     * will often be variables defined and used inside the
2313	     * loop but dead at the end of the loop body.
2314	     */
2315	    for (int i = 0; i < num_vars; i++) {
2316	       if (use[i] == loop_start) {
2317		  use[i] = ip;
2318	       }
2319	    }
2320	 }
2321      } else {
2322	 int eip = ip;
2323
2324	 if (loop_depth)
2325	    eip = loop_start;
2326
2327	 for (unsigned int i = 0; i < 3; i++) {
2328	    if (inst->src[i].file == GRF && inst->src[i].reg != 0) {
2329	       use[inst->src[i].reg] = MAX2(use[inst->src[i].reg], eip);
2330	    }
2331	 }
2332	 if (inst->dst.file == GRF && inst->dst.reg != 0) {
2333	    def[inst->dst.reg] = MIN2(def[inst->dst.reg], eip);
2334	 }
2335      }
2336
2337      ip++;
2338   }
2339
2340   talloc_free(this->virtual_grf_def);
2341   talloc_free(this->virtual_grf_use);
2342   this->virtual_grf_def = def;
2343   this->virtual_grf_use = use;
2344}
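
/* Illustrative sketch (hypothetical types and names): the core of the
 * pass above on a straight-line program.  def[] keeps the earliest write
 * and use[] the latest read of each virtual GRF; the DO/WHILE handling
 * in the real pass only stretches those ranges over whole loops.
 */
#if 0
struct toy_inst {
   int dst;	 /* virtual GRF written, 0 if none */
   int src[3];	 /* virtual GRFs read, 0 if unused */
};

static void
toy_live_intervals(const struct toy_inst *insts, int n,
		   int *def, int *use, int num_vars)
{
   for (int i = 0; i < num_vars; i++) {
      def[i] = 1 << 30;
      use[i] = -1;
   }
   for (int ip = 0; ip < n; ip++) {
      for (int i = 0; i < 3; i++) {
	 if (insts[ip].src[i])
	    use[insts[ip].src[i]] = MAX2(use[insts[ip].src[i]], ip);
      }
      if (insts[ip].dst)
	 def[insts[ip].dst] = MIN2(def[insts[ip].dst], ip);
   }
}
#endif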
2345
2346/**
2347 * Attempts to move immediate constants into the immediate
2348 * constant slot of following instructions.
2349 *
2350 * Immediate constants are a bit tricky -- they have to be in the last
2351 * operand slot, and you can't do abs/negate on them.
2352 */
2353
2354bool
2355fs_visitor::propagate_constants()
2356{
2357   bool progress = false;
2358
2359   foreach_iter(exec_list_iterator, iter, this->instructions) {
2360      fs_inst *inst = (fs_inst *)iter.get();
2361
2362      if (inst->opcode != BRW_OPCODE_MOV ||
2363	  inst->predicated ||
2364	  inst->dst.file != GRF || inst->src[0].file != IMM ||
2365	  inst->dst.type != inst->src[0].type)
2366	 continue;
2367
2368      /* Don't bother with cases where we should have had the
2369       * operation on the constant folded in GLSL already.
2370       */
2371      if (inst->saturate)
2372	 continue;
2373
2374      /* Found a move of a constant to a GRF.  Find anything else using the GRF
2375       * before it's written, and replace it with the constant if we can.
2376       */
2377      exec_list_iterator scan_iter = iter;
2378      scan_iter.next();
2379      for (; scan_iter.has_next(); scan_iter.next()) {
2380	 fs_inst *scan_inst = (fs_inst *)scan_iter.get();
2381
2382	 if (scan_inst->opcode == BRW_OPCODE_DO ||
2383	     scan_inst->opcode == BRW_OPCODE_WHILE ||
2384	     scan_inst->opcode == BRW_OPCODE_ELSE ||
2385	     scan_inst->opcode == BRW_OPCODE_ENDIF) {
2386	    break;
2387	 }
2388
2389	 for (int i = 2; i >= 0; i--) {
2390	    if (scan_inst->src[i].file != GRF ||
2391		scan_inst->src[i].reg != inst->dst.reg ||
2392		scan_inst->src[i].reg_offset != inst->dst.reg_offset)
2393	       continue;
2394
2395	    /* Don't bother with cases where we should have had the
2396	     * operation on the constant folded in GLSL already.
2397	     */
2398	    if (scan_inst->src[i].negate || scan_inst->src[i].abs)
2399	       continue;
2400
2401	    switch (scan_inst->opcode) {
2402	    case BRW_OPCODE_MOV:
2403	       scan_inst->src[i] = inst->src[0];
2404	       progress = true;
2405	       break;
2406
2407	    case BRW_OPCODE_MUL:
2408	    case BRW_OPCODE_ADD:
2409	       if (i == 1) {
2410		  scan_inst->src[i] = inst->src[0];
2411		  progress = true;
2412	       } else if (i == 0 && scan_inst->src[1].file != IMM) {
2413		  /* Fit this constant in by commuting the operands */
2414		  scan_inst->src[0] = scan_inst->src[1];
2415		  scan_inst->src[1] = inst->src[0];
		  progress = true;
2416	       }
2417	       break;
2418	    case BRW_OPCODE_CMP:
2419	       if (i == 1) {
2420		  scan_inst->src[i] = inst->src[0];
2421		  progress = true;
2422	       }
2423	    }
2424	 }
2425
2426	 if (scan_inst->dst.file == GRF &&
2427	     scan_inst->dst.reg == inst->dst.reg &&
2428	     (scan_inst->dst.reg_offset == inst->dst.reg_offset ||
2429	      scan_inst->opcode == FS_OPCODE_TEX)) {
2430	    break;
2431	 }
2432      }
2433   }
2434
2435   return progress;
2436}
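
/* Illustrative sketch: the kind of rewrite the commuting case above
 * performs (register numbers invented for illustration).  Given
 *
 *    MOV g4, 2.0f
 *    MUL g5, g4, g6
 *
 * the immediate can't live in src0, so the MUL is rewritten as
 *
 *    MUL g5, g6, 2.0f
 *
 * which is legal because MUL is commutative and immediates are only
 * allowed in the last operand slot.
 */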
2437/**
2438 * Must be called after calculate_live_intervals() to remove unused
2439 * writes to registers -- register allocation will fail otherwise
2440 * because something def'd but not used won't be considered to
2441 * interfere with other regs.
2442 */
2443bool
2444fs_visitor::dead_code_eliminate()
2445{
2446   bool progress = false;
2447   int num_vars = this->virtual_grf_next;
2448   bool dead[num_vars];
2449
2450   for (int i = 0; i < num_vars; i++) {
2451      dead[i] = this->virtual_grf_def[i] >= this->virtual_grf_use[i];
2452
2453      if (dead[i]) {
2454	 /* Mark off its interval so it won't interfere with anything. */
2455	 this->virtual_grf_def[i] = -1;
2456	 this->virtual_grf_use[i] = -1;
2457      }
2458   }
2459
2460   foreach_iter(exec_list_iterator, iter, this->instructions) {
2461      fs_inst *inst = (fs_inst *)iter.get();
2462
2463      if (inst->dst.file == GRF && dead[inst->dst.reg]) {
2464	 inst->remove();
2465	 progress = true;
2466      }
2467   }
2468
2469   return progress;
2470}
2471
2472bool
2473fs_visitor::register_coalesce()
2474{
2475   bool progress = false;
2476
2477   foreach_iter(exec_list_iterator, iter, this->instructions) {
2478      fs_inst *inst = (fs_inst *)iter.get();
2479
2480      if (inst->opcode != BRW_OPCODE_MOV ||
2481	  inst->predicated ||
2482	  inst->saturate ||
2483	  inst->dst.file != GRF || inst->src[0].file != GRF ||
2484	  inst->dst.type != inst->src[0].type)
2485	 continue;
2486
2487      /* Found a move of a GRF to a GRF.  Let's see if we can coalesce
2488       * them: check for no writes to either one until the exit of the
2489       * program.
2490       */
2491      bool interfered = false;
2492      exec_list_iterator scan_iter = iter;
2493      scan_iter.next();
2494      for (; scan_iter.has_next(); scan_iter.next()) {
2495	 fs_inst *scan_inst = (fs_inst *)scan_iter.get();
2496
2497	 if (scan_inst->opcode == BRW_OPCODE_DO ||
2498	     scan_inst->opcode == BRW_OPCODE_WHILE ||
2499	     scan_inst->opcode == BRW_OPCODE_ENDIF) {
2500	    interfered = true;
2501	    iter = scan_iter;
2502	    break;
2503	 }
2504
2505	 if (scan_inst->dst.file == GRF) {
2506	    if (scan_inst->dst.reg == inst->dst.reg &&
2507		(scan_inst->dst.reg_offset == inst->dst.reg_offset ||
2508		 scan_inst->opcode == FS_OPCODE_TEX)) {
2509	       interfered = true;
2510	       break;
2511	    }
2512	    if (scan_inst->dst.reg == inst->src[0].reg &&
2513		(scan_inst->dst.reg_offset == inst->src[0].reg_offset ||
2514		 scan_inst->opcode == FS_OPCODE_TEX)) {
2515	       interfered = true;
2516	       break;
2517	    }
2518	 }
2519      }
2520      if (interfered) {
2521	 continue;
2522      }
2523
2524      /* Rewrite the later usage to point at the source of the move to
2525       * be removed.
2526       */
2527      for (exec_list_iterator scan_iter = iter; scan_iter.has_next();
2528	   scan_iter.next()) {
2529	 fs_inst *scan_inst = (fs_inst *)scan_iter.get();
2530
2531	 for (int i = 0; i < 3; i++) {
2532	    if (scan_inst->src[i].file == GRF &&
2533		scan_inst->src[i].reg == inst->dst.reg &&
2534		scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
2535	       scan_inst->src[i].reg = inst->src[0].reg;
2536	       scan_inst->src[i].reg_offset = inst->src[0].reg_offset;
2537	       scan_inst->src[i].abs |= inst->src[0].abs;
2538	       scan_inst->src[i].negate ^= inst->src[0].negate;
2539	    }
2540	 }
2541      }
2542
2543      inst->remove();
2544      progress = true;
2545   }
2546
2547   return progress;
2548}
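
/* Illustrative sketch: a coalesce the pass above performs (register
 * numbers invented for illustration).  Given
 *
 *    MUL g4, g1, g2
 *    MOV g5, g4
 *    ADD g6, g5, g3
 *
 * with neither g4 nor g5 rewritten afterward, the MOV is removed and the
 * later read is redirected:
 *
 *    MUL g4, g1, g2
 *    ADD g6, g4, g3
 */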
2549
2550
2551bool
2552fs_visitor::compute_to_mrf()
2553{
2554   bool progress = false;
2555   int next_ip = 0;
2556
2557   foreach_iter(exec_list_iterator, iter, this->instructions) {
2558      fs_inst *inst = (fs_inst *)iter.get();
2559
2560      int ip = next_ip;
2561      next_ip++;
2562
2563      if (inst->opcode != BRW_OPCODE_MOV ||
2564	  inst->predicated ||
2565	  inst->dst.file != MRF || inst->src[0].file != GRF ||
2566	  inst->dst.type != inst->src[0].type ||
2567	  inst->src[0].abs || inst->src[0].negate)
2568	 continue;
2569
2570      /* Can't compute-to-MRF this GRF if someone else was going to
2571       * read it later.
2572       */
2573      if (this->virtual_grf_use[inst->src[0].reg] > ip)
2574	 continue;
2575
2576      /* Found a move of a GRF to an MRF.  Let's see if we can rewrite
2577       * the instruction that produced this GRF to write into the MRF instead.
2578       */
2579      bool found = false;
2580      fs_inst *scan_inst;
2581      for (scan_inst = (fs_inst *)inst->prev;
2582	   scan_inst->prev != NULL;
2583	   scan_inst = (fs_inst *)scan_inst->prev) {
2584	 /* We don't handle flow control here.  Most values that end
2585	  * up in MRFs are computed shortly before the MRF write
2586	  * anyway.
2587	  */
2588	 if (scan_inst->opcode == BRW_OPCODE_DO ||
2589	     scan_inst->opcode == BRW_OPCODE_WHILE ||
2590	     scan_inst->opcode == BRW_OPCODE_ENDIF) {
2591	    break;
2592	 }
2593
2594	 /* You can't read from an MRF, so if someone else reads our
2595	  * MRF's source GRF that we wanted to rewrite, that stops us.
2596	  */
2597	 bool interfered = false;
2598	 for (int i = 0; i < 3; i++) {
2599	    if (scan_inst->src[i].file == GRF &&
2600		scan_inst->src[i].reg == inst->src[0].reg &&
2601		scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2602	       interfered = true;
2603	    }
2604	 }
2605	 if (interfered)
2606	    break;
2607
2608	 if (scan_inst->dst.file == MRF &&
2609	     scan_inst->dst.hw_reg == inst->dst.hw_reg) {
2610	    /* Somebody else wrote our MRF here, so we can't
2611	     * compute-to-MRF before that.
2612	     */
2613	    break;
2614	 }
2615
2616	 if (scan_inst->mlen > 0) {
2617	    /* Found a SEND instruction, which does some amount of
2618	     * implied write that may overwrite the MRF we were hoping
2619	     * to compute-to-MRF somewhere above it.  Nothing we
2620	     * generate implied-writes more than 2 MRFs past base_mrf,
2621	     * though.
2622	     */
2623	    int implied_write_len = MIN2(scan_inst->mlen, 2);
2624	    if (inst->dst.hw_reg >= scan_inst->base_mrf &&
2625		inst->dst.hw_reg < scan_inst->base_mrf + implied_write_len) {
2626	       break;
2627	    }
2628	 }
2629
2630	 if (scan_inst->dst.file == GRF &&
2631	     scan_inst->dst.reg == inst->src[0].reg) {
2632	    /* Found the last thing to write our reg we want to turn
2633	     * into a compute-to-MRF.
2634	     */
2635
2636	    if (scan_inst->opcode == FS_OPCODE_TEX) {
2637	       /* Texturing writes several contiguous regs, so we can't
2638		* compute-to-MRF that.
2639		*/
2640	       break;
2641	    }
2642
2643	    /* If it's predicated, it (probably) didn't populate all
2644	     * the channels.
2645	     */
2646	    if (scan_inst->predicated)
2647	       break;
2648
2649	    /* SEND instructions can't have MRF as a destination. */
2650	    if (scan_inst->mlen)
2651	       break;
2652
2653	    if (intel->gen >= 6) {
2654	       /* gen6 math instructions must have the destination be
2655		* GRF, so no compute-to-MRF for them.
2656		*/
2657	       if (scan_inst->opcode == FS_OPCODE_RCP ||
2658		   scan_inst->opcode == FS_OPCODE_RSQ ||
2659		   scan_inst->opcode == FS_OPCODE_SQRT ||
2660		   scan_inst->opcode == FS_OPCODE_EXP2 ||
2661		   scan_inst->opcode == FS_OPCODE_LOG2 ||
2662		   scan_inst->opcode == FS_OPCODE_SIN ||
2663		   scan_inst->opcode == FS_OPCODE_COS ||
2664		   scan_inst->opcode == FS_OPCODE_POW) {
2665		  break;
2666	       }
2667	    }
2668
2669	    if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2670	       /* Found the creator of our MRF's source value. */
2671	       found = true;
2672	       break;
2673	    }
2674	 }
2675      }
2676      if (found) {
2677	 scan_inst->dst.file = MRF;
2678	 scan_inst->dst.hw_reg = inst->dst.hw_reg;
2679	 scan_inst->saturate |= inst->saturate;
2680	 inst->remove();
2681	 progress = true;
2682      }
2683   }
2684
2685   return progress;
2686}
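
/* Illustrative sketch: the rewrite compute_to_mrf() performs (register
 * numbers invented for illustration).  Given
 *
 *    ADD g4, g1, g2
 *    MOV m3, g4
 *
 * with g4 dead after the MOV and m3 not written in between, the
 * generator of the value writes the MRF directly:
 *
 *    ADD m3, g1, g2
 */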
2687
2688bool
2689fs_visitor::virtual_grf_interferes(int a, int b)
2690{
2691   int start = MAX2(this->virtual_grf_def[a], this->virtual_grf_def[b]);
2692   int end = MIN2(this->virtual_grf_use[a], this->virtual_grf_use[b]);
2693
2694   /* For dead code, just check if the def interferes with the other range. */
2695   if (this->virtual_grf_use[a] == -1) {
2696      return (this->virtual_grf_def[a] >= this->virtual_grf_def[b] &&
2697	      this->virtual_grf_def[a] < this->virtual_grf_use[b]);
2698   }
2699   if (this->virtual_grf_use[b] == -1) {
2700      return (this->virtual_grf_def[b] >= this->virtual_grf_def[a] &&
2701	      this->virtual_grf_def[b] < this->virtual_grf_use[a]);
2702   }
2703
2704   return start < end;
2705}
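
/* Illustrative sketch (hypothetical helper): the test above is plain
 * interval intersection on [def, use) ranges.  [2, 5) and [4, 7) overlap
 * because MAX2(2, 4) < MIN2(5, 7); [2, 4) and [4, 7) do not.
 */
#if 0
static bool
ranges_overlap(int def_a, int use_a, int def_b, int use_b)
{
   return MAX2(def_a, def_b) < MIN2(use_a, use_b);
}
#endif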
2706
2707static struct brw_reg brw_reg_from_fs_reg(fs_reg *reg)
2708{
2709   struct brw_reg brw_reg;
2710
2711   switch (reg->file) {
2712   case GRF:
2713   case ARF:
2714   case MRF:
2715      brw_reg = brw_vec8_reg(reg->file,
2716			    reg->hw_reg, 0);
2717      brw_reg = retype(brw_reg, reg->type);
2718      break;
2719   case IMM:
2720      switch (reg->type) {
2721      case BRW_REGISTER_TYPE_F:
2722	 brw_reg = brw_imm_f(reg->imm.f);
2723	 break;
2724      case BRW_REGISTER_TYPE_D:
2725	 brw_reg = brw_imm_d(reg->imm.i);
2726	 break;
2727      case BRW_REGISTER_TYPE_UD:
2728	 brw_reg = brw_imm_ud(reg->imm.u);
2729	 break;
2730      default:
2731	 assert(!"not reached");
2732	 break;
2733      }
2734      break;
2735   case FIXED_HW_REG:
2736      brw_reg = reg->fixed_hw_reg;
2737      break;
2738   case BAD_FILE:
2739      /* Probably unused. */
2740      brw_reg = brw_null_reg();
2741      break;
2742   case UNIFORM:
2743      assert(!"not reached");
2744      brw_reg = brw_null_reg();
2745      break;
2746   }
2747   if (reg->abs)
2748      brw_reg = brw_abs(brw_reg);
2749   if (reg->negate)
2750      brw_reg = negate(brw_reg);
2751
2752   return brw_reg;
2753}
2754
2755void
2756fs_visitor::generate_code()
2757{
2758   unsigned int annotation_len = 0;
2759   int last_native_inst = 0;
2760   struct brw_instruction *if_stack[16], *loop_stack[16];
2761   int if_stack_depth = 0, loop_stack_depth = 0;
2762   int if_depth_in_loop[16];
2763
2764   if_depth_in_loop[loop_stack_depth] = 0;
2765
2766   memset(&if_stack, 0, sizeof(if_stack));
2767   foreach_iter(exec_list_iterator, iter, this->instructions) {
2768      fs_inst *inst = (fs_inst *)iter.get();
2769      struct brw_reg src[3], dst;
2770
2771      for (unsigned int i = 0; i < 3; i++) {
2772	 src[i] = brw_reg_from_fs_reg(&inst->src[i]);
2773      }
2774      dst = brw_reg_from_fs_reg(&inst->dst);
2775
2776      brw_set_conditionalmod(p, inst->conditional_mod);
2777      brw_set_predicate_control(p, inst->predicated);
2778
2779      switch (inst->opcode) {
2780      case BRW_OPCODE_MOV:
2781	 brw_MOV(p, dst, src[0]);
2782	 break;
2783      case BRW_OPCODE_ADD:
2784	 brw_ADD(p, dst, src[0], src[1]);
2785	 break;
2786      case BRW_OPCODE_MUL:
2787	 brw_MUL(p, dst, src[0], src[1]);
2788	 break;
2789
2790      case BRW_OPCODE_FRC:
2791	 brw_FRC(p, dst, src[0]);
2792	 break;
2793      case BRW_OPCODE_RNDD:
2794	 brw_RNDD(p, dst, src[0]);
2795	 break;
2796      case BRW_OPCODE_RNDZ:
2797	 brw_RNDZ(p, dst, src[0]);
2798	 break;
2799
2800      case BRW_OPCODE_AND:
2801	 brw_AND(p, dst, src[0], src[1]);
2802	 break;
2803      case BRW_OPCODE_OR:
2804	 brw_OR(p, dst, src[0], src[1]);
2805	 break;
2806      case BRW_OPCODE_XOR:
2807	 brw_XOR(p, dst, src[0], src[1]);
2808	 break;
2809      case BRW_OPCODE_NOT:
2810	 brw_NOT(p, dst, src[0]);
2811	 break;
2812      case BRW_OPCODE_ASR:
2813	 brw_ASR(p, dst, src[0], src[1]);
2814	 break;
2815      case BRW_OPCODE_SHR:
2816	 brw_SHR(p, dst, src[0], src[1]);
2817	 break;
2818      case BRW_OPCODE_SHL:
2819	 brw_SHL(p, dst, src[0], src[1]);
2820	 break;
2821
2822      case BRW_OPCODE_CMP:
2823	 brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
2824	 break;
2825      case BRW_OPCODE_SEL:
2826	 brw_SEL(p, dst, src[0], src[1]);
2827	 break;
2828
2829      case BRW_OPCODE_IF:
2830	 assert(if_stack_depth < 16);
2831	 if_stack[if_stack_depth] = brw_IF(p, BRW_EXECUTE_8);
2832	 if_depth_in_loop[loop_stack_depth]++;
2833	 if_stack_depth++;
2834	 break;
2835      case BRW_OPCODE_ELSE:
2836	 if_stack[if_stack_depth - 1] =
2837	    brw_ELSE(p, if_stack[if_stack_depth - 1]);
2838	 break;
2839      case BRW_OPCODE_ENDIF:
2840	 if_stack_depth--;
2841	 brw_ENDIF(p, if_stack[if_stack_depth]);
2842	 if_depth_in_loop[loop_stack_depth]--;
2843	 break;
2844
2845      case BRW_OPCODE_DO:
2846	 loop_stack[loop_stack_depth++] = brw_DO(p, BRW_EXECUTE_8);
2847	 if_depth_in_loop[loop_stack_depth] = 0;
2848	 break;
2849
2850      case BRW_OPCODE_BREAK:
2851	 brw_BREAK(p, if_depth_in_loop[loop_stack_depth]);
2852	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2853	 break;
2854      case BRW_OPCODE_CONTINUE:
2855	 brw_CONT(p, if_depth_in_loop[loop_stack_depth]);
2856	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2857	 break;
2858
2859      case BRW_OPCODE_WHILE: {
2860	 struct brw_instruction *inst0, *inst1;
2861	 GLuint br = 1;
2862
2863	 if (intel->gen >= 5)
2864	    br = 2;
2865
2866	 assert(loop_stack_depth > 0);
2867	 loop_stack_depth--;
2868	 inst0 = inst1 = brw_WHILE(p, loop_stack[loop_stack_depth]);
2869	 /* Patch all the BREAK/CONT instructions from the last BGNLOOP. */
2870	 while (inst0 > loop_stack[loop_stack_depth]) {
2871	    inst0--;
2872	    if (inst0->header.opcode == BRW_OPCODE_BREAK &&
2873		inst0->bits3.if_else.jump_count == 0) {
2874	       inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
2875	    }
2876	    else if (inst0->header.opcode == BRW_OPCODE_CONTINUE &&
2877		     inst0->bits3.if_else.jump_count == 0) {
2878	       inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
2879	    }
2880	 }
2881      }
2882	 break;
2883
2884      case FS_OPCODE_RCP:
2885      case FS_OPCODE_RSQ:
2886      case FS_OPCODE_SQRT:
2887      case FS_OPCODE_EXP2:
2888      case FS_OPCODE_LOG2:
2889      case FS_OPCODE_POW:
2890      case FS_OPCODE_SIN:
2891      case FS_OPCODE_COS:
2892	 generate_math(inst, dst, src);
2893	 break;
2894      case FS_OPCODE_LINTERP:
2895	 generate_linterp(inst, dst, src);
2896	 break;
2897      case FS_OPCODE_TEX:
2898      case FS_OPCODE_TXB:
2899      case FS_OPCODE_TXL:
2900	 generate_tex(inst, dst);
2901	 break;
2902      case FS_OPCODE_DISCARD_NOT:
2903	 generate_discard_not(inst, dst);
2904	 break;
2905      case FS_OPCODE_DISCARD_AND:
2906	 generate_discard_and(inst, src[0]);
2907	 break;
2908      case FS_OPCODE_DDX:
2909	 generate_ddx(inst, dst, src[0]);
2910	 break;
2911      case FS_OPCODE_DDY:
2912	 generate_ddy(inst, dst, src[0]);
2913	 break;
2914      case FS_OPCODE_FB_WRITE:
2915	 generate_fb_write(inst);
2916	 break;
2917      default:
2918	 if (inst->opcode < (int)ARRAY_SIZE(brw_opcodes)) {
2919	    _mesa_problem(ctx, "Unsupported opcode `%s' in FS",
2920			  brw_opcodes[inst->opcode].name);
2921	 } else {
2922	    _mesa_problem(ctx, "Unsupported opcode %d in FS", inst->opcode);
2923	 }
2924	 this->fail = true;
2925      }
2926
2927      if (annotation_len < p->nr_insn) {
2928	 annotation_len *= 2;
2929	 if (annotation_len < 16)
2930	    annotation_len = 16;
2931
2932	 this->annotation_string = talloc_realloc(this->mem_ctx,
2933						  annotation_string,
2934						  const char *,
2935						  annotation_len);
2936	 this->annotation_ir = talloc_realloc(this->mem_ctx,
2937					      annotation_ir,
2938					      ir_instruction *,
2939					      annotation_len);
2940      }
2941
2942      for (unsigned int i = last_native_inst; i < p->nr_insn; i++) {
2943	 this->annotation_string[i] = inst->annotation;
2944	 this->annotation_ir[i] = inst->ir;
2945      }
2946      last_native_inst = p->nr_insn;
2947   }
2948}
2949
2950GLboolean
2951brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c)
2952{
2953   struct brw_compile *p = &c->func;
2954   struct intel_context *intel = &brw->intel;
2955   GLcontext *ctx = &intel->ctx;
2956   struct brw_shader *shader = NULL;
2957   struct gl_shader_program *prog = ctx->Shader.CurrentProgram;
2958
2959   if (!prog)
2960      return GL_FALSE;
2961
2962   if (!using_new_fs)
2963      return GL_FALSE;
2964
2965   for (unsigned int i = 0; i < prog->_NumLinkedShaders; i++) {
2966      if (prog->_LinkedShaders[i]->Type == GL_FRAGMENT_SHADER) {
2967	 shader = (struct brw_shader *)prog->_LinkedShaders[i];
2968	 break;
2969      }
2970   }
2971   if (!shader)
2972      return GL_FALSE;
2973
2974   /* We always use 8-wide mode, at least for now.  For one, flow
2975    * control only works in 8-wide.  Also, when we're fragment shader
2976    * bound, we're almost always under register pressure as well, so
2977    * 8-wide would save us from the performance cliff of spilling
2978    * regs.
2979    */
2980   c->dispatch_width = 8;
2981
2982   if (INTEL_DEBUG & DEBUG_WM) {
2983      printf("GLSL IR for native fragment shader %d:\n", prog->Name);
2984      _mesa_print_ir(shader->ir, NULL);
2985      printf("\n");
2986   }
2987
2988   /* Now the main event: Visit the shader IR and generate our FS IR for it.
2989    */
2990   fs_visitor v(c, shader);
2991
2992   if (0) {
2993      v.emit_dummy_fs();
2994   } else {
2995      v.calculate_urb_setup();
2996      if (intel->gen < 6)
2997	 v.emit_interpolation_setup_gen4();
2998      else
2999	 v.emit_interpolation_setup_gen6();
3000
3001      /* Generate FS IR for main().  (the visitor only descends into
3002       * functions called "main").
3003       */
3004      foreach_iter(exec_list_iterator, iter, *shader->ir) {
3005	 ir_instruction *ir = (ir_instruction *)iter.get();
3006	 v.base_ir = ir;
3007	 ir->accept(&v);
3008      }
3009
3010      v.emit_fb_writes();
3011      v.assign_curb_setup();
3012      v.assign_urb_setup();
3013
3014      bool progress;
3015      do {
3016	 progress = false;
3017
3018	 v.calculate_live_intervals();
3019	 progress = v.propagate_constants() || progress;
3020	 progress = v.register_coalesce() || progress;
3021	 progress = v.compute_to_mrf() || progress;
3022	 progress = v.dead_code_eliminate() || progress;
3023      } while (progress);
3024
3025      if (0)
3026	 v.assign_regs_trivial();
3027      else
3028	 v.assign_regs();
3029   }
3030
3031   if (!v.fail)
3032      v.generate_code();
3033
3034   assert(!v.fail); /* FINISHME: Cleanly fail, tested at link time, etc. */
3035
3036   if (v.fail)
3037      return GL_FALSE;
3038
3039   if (INTEL_DEBUG & DEBUG_WM) {
3040      const char *last_annotation_string = NULL;
3041      ir_instruction *last_annotation_ir = NULL;
3042
3043      printf("Native code for fragment shader %d:\n", prog->Name);
3044      for (unsigned int i = 0; i < p->nr_insn; i++) {
3045	 if (last_annotation_ir != v.annotation_ir[i]) {
3046	    last_annotation_ir = v.annotation_ir[i];
3047	    if (last_annotation_ir) {
3048	       printf("   ");
3049	       last_annotation_ir->print();
3050	       printf("\n");
3051	    }
3052	 }
3053	 if (last_annotation_string != v.annotation_string[i]) {
3054	    last_annotation_string = v.annotation_string[i];
3055	    if (last_annotation_string)
3056	       printf("   %s\n", last_annotation_string);
3057	 }
3058	 brw_disasm(stdout, &p->store[i], intel->gen);
3059      }
3060      printf("\n");
3061   }
3062
3063   c->prog_data.total_grf = v.grf_used;
3064   c->prog_data.total_scratch = 0;
3065
3066   return GL_TRUE;
3067}
3068