brw_fs.cpp revision c6dbf253d284f68b0d0e4a3c145583880855324b
/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Eric Anholt <eric@anholt.net>
 *
 */

extern "C" {

#include <sys/types.h>

#include "main/macros.h"
#include "main/shaderobj.h"
#include "main/uniforms.h"
#include "program/prog_parameter.h"
#include "program/prog_print.h"
#include "program/prog_optimize.h"
#include "program/register_allocate.h"
#include "program/sampler.h"
#include "program/hash_table.h"
#include "brw_context.h"
#include "brw_eu.h"
#include "brw_wm.h"
#include "talloc.h"
}
#include "brw_fs.h"
#include "../glsl/glsl_types.h"
#include "../glsl/ir_optimization.h"
#include "../glsl/ir_print_visitor.h"

static int using_new_fs = -1;
static struct brw_reg brw_reg_from_fs_reg(class fs_reg *reg);

struct gl_shader *
brw_new_shader(GLcontext *ctx, GLuint name, GLuint type)
{
   struct brw_shader *shader;

   shader = talloc_zero(NULL, struct brw_shader);
   if (shader) {
      shader->base.Type = type;
      shader->base.Name = name;
      _mesa_init_shader(ctx, &shader->base);
   }

   return &shader->base;
}

struct gl_shader_program *
brw_new_shader_program(GLcontext *ctx, GLuint name)
{
   struct brw_shader_program *prog;
   prog = talloc_zero(NULL, struct brw_shader_program);
   if (prog) {
      prog->base.Name = name;
      _mesa_init_shader_program(ctx, &prog->base);
   }
   return &prog->base;
}

GLboolean
brw_compile_shader(GLcontext *ctx, struct gl_shader *shader)
{
   if (!_mesa_ir_compile_shader(ctx, shader))
      return GL_FALSE;

   return GL_TRUE;
}

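/**
 * Called via ctx->Driver.LinkShader().
 *
 * When INTEL_NEW_FS is set, this clones the linked GLSL IR for each
 * fragment shader and runs the lowering passes the scalar backend
 * relies on (matrix-to-vector, channel expressions, vector splitting,
 * and friends) until they stop making progress, before handing the
 * program to the shared Mesa IR linker.
 */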
GLboolean
brw_link_shader(GLcontext *ctx, struct gl_shader_program *prog)
{
   struct intel_context *intel = intel_context(ctx);
   if (using_new_fs == -1)
      using_new_fs = getenv("INTEL_NEW_FS") != NULL;

   for (unsigned i = 0; i < prog->_NumLinkedShaders; i++) {
      struct brw_shader *shader = (struct brw_shader *)prog->_LinkedShaders[i];

      if (using_new_fs && shader->base.Type == GL_FRAGMENT_SHADER) {
         void *mem_ctx = talloc_new(NULL);
         bool progress;

         if (shader->ir)
            talloc_free(shader->ir);
         shader->ir = new(shader) exec_list;
         clone_ir_list(mem_ctx, shader->ir, shader->base.ir);

         do_mat_op_to_vec(shader->ir);
         do_mod_to_fract(shader->ir);
         do_div_to_mul_rcp(shader->ir);
         do_sub_to_add_neg(shader->ir);
         do_explog_to_explog2(shader->ir);
         do_lower_texture_projection(shader->ir);
         brw_do_cubemap_normalize(shader->ir);

         do {
            progress = false;

            brw_do_channel_expressions(shader->ir);
            brw_do_vector_splitting(shader->ir);

            progress = do_lower_jumps(shader->ir, true, true,
                                      true, /* main return */
                                      false, /* continue */
                                      false /* loops */
                                      ) || progress;

            progress = do_common_optimization(shader->ir, true, 32) || progress;

            progress = lower_noise(shader->ir) || progress;
            progress =
               lower_variable_index_to_cond_assign(shader->ir,
                                                   GL_TRUE, /* input */
                                                   GL_TRUE, /* output */
                                                   GL_TRUE, /* temp */
                                                   GL_TRUE /* uniform */
                                                   ) || progress;
            if (intel->gen == 6) {
               progress = do_if_to_cond_assign(shader->ir) || progress;
            }
         } while (progress);

         validate_ir_tree(shader->ir);

         reparent_ir(shader->ir, shader->ir);
         talloc_free(mem_ctx);
      }
   }

   if (!_mesa_ir_link_shader(ctx, prog))
      return GL_FALSE;

   return GL_TRUE;
}

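/**
 * Returns the number of scalar components a value of the given GLSL
 * type occupies in a register file.
 */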
static int
type_size(const struct glsl_type *type)
{
   unsigned int size, i;

   switch (type->base_type) {
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_BOOL:
      return type->components();
   case GLSL_TYPE_ARRAY:
      return type_size(type->fields.array) * type->length;
   case GLSL_TYPE_STRUCT:
      size = 0;
      for (i = 0; i < type->length; i++) {
         size += type_size(type->fields.structure[i].type);
      }
      return size;
   case GLSL_TYPE_SAMPLER:
      /* Samplers take up no register space, since they're baked in at
       * link time.
       */
      return 0;
   default:
      assert(!"not reached");
      return 0;
   }
}

static const fs_reg reg_undef;
static const fs_reg reg_null(ARF, BRW_ARF_NULL);

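/**
 * Allocates a virtual GRF of the given size (in register slots) and
 * returns its index, growing the size-tracking array as needed.
 */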
int
fs_visitor::virtual_grf_alloc(int size)
{
   if (virtual_grf_array_size <= virtual_grf_next) {
      if (virtual_grf_array_size == 0)
         virtual_grf_array_size = 16;
      else
         virtual_grf_array_size *= 2;
      virtual_grf_sizes = talloc_realloc(mem_ctx, virtual_grf_sizes,
                                         int, virtual_grf_array_size);

      /* This slot is always unused. */
      virtual_grf_sizes[0] = 0;
   }
   virtual_grf_sizes[virtual_grf_next] = size;
   return virtual_grf_next++;
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int hw_reg)
{
   init();
   this->file = file;
   this->hw_reg = hw_reg;
   this->type = BRW_REGISTER_TYPE_F;
}

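/** Maps a GLSL base type to the hardware register type used to hold it. */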
int
brw_type_for_base_type(const struct glsl_type *type)
{
   switch (type->base_type) {
   case GLSL_TYPE_FLOAT:
      return BRW_REGISTER_TYPE_F;
   case GLSL_TYPE_INT:
   case GLSL_TYPE_BOOL:
      return BRW_REGISTER_TYPE_D;
   case GLSL_TYPE_UINT:
      return BRW_REGISTER_TYPE_UD;
   case GLSL_TYPE_ARRAY:
   case GLSL_TYPE_STRUCT:
      /* These should be overridden with the type of the member when
       * dereferenced into.  BRW_REGISTER_TYPE_UD seems like a likely
       * way to trip up if we don't.
       */
      return BRW_REGISTER_TYPE_UD;
   default:
      assert(!"not reached");
      return BRW_REGISTER_TYPE_F;
   }
}

/** Automatic reg constructor. */
fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
{
   init();

   this->file = GRF;
   this->reg = v->virtual_grf_alloc(type_size(type));
   this->reg_offset = 0;
   this->type = brw_type_for_base_type(type);
}

fs_reg *
fs_visitor::variable_storage(ir_variable *var)
{
   return (fs_reg *)hash_table_find(this->variable_ht, var);
}

/* Our support for uniforms is piggy-backed on the struct
 * gl_fragment_program, because that's where the values actually
 * get stored, rather than in some global gl_shader_program uniform
 * store.
 */
int
fs_visitor::setup_uniform_values(int loc, const glsl_type *type)
{
   unsigned int offset = 0;
   float *vec_values;

   if (type->is_matrix()) {
      const glsl_type *column = glsl_type::get_instance(GLSL_TYPE_FLOAT,
                                                        type->vector_elements,
                                                        1);

      for (unsigned int i = 0; i < type->matrix_columns; i++) {
         offset += setup_uniform_values(loc + offset, column);
      }

      return offset;
   }

   switch (type->base_type) {
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_BOOL:
      vec_values = fp->Base.Parameters->ParameterValues[loc];
      for (unsigned int i = 0; i < type->vector_elements; i++) {
         c->prog_data.param[c->prog_data.nr_params++] = &vec_values[i];
      }
      return 1;

   case GLSL_TYPE_STRUCT:
      for (unsigned int i = 0; i < type->length; i++) {
         offset += setup_uniform_values(loc + offset,
                                        type->fields.structure[i].type);
      }
      return offset;

   case GLSL_TYPE_ARRAY:
      for (unsigned int i = 0; i < type->length; i++) {
         offset += setup_uniform_values(loc + offset, type->fields.array);
      }
      return offset;

   case GLSL_TYPE_SAMPLER:
      /* The sampler takes up a slot, but we don't use any values from it. */
      return 1;

   default:
      assert(!"not reached");
      return 0;
   }
}


/* Our support for builtin uniforms is even scarier than non-builtin.
 * It sits on top of the PROG_STATE_VAR parameters that are
 * automatically updated from GL context state.
 */
void
fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
{
   const struct gl_builtin_uniform_desc *statevar = NULL;

   for (unsigned int i = 0; _mesa_builtin_uniform_desc[i].name; i++) {
      if (strcmp(ir->name, _mesa_builtin_uniform_desc[i].name) == 0) {
         statevar = &_mesa_builtin_uniform_desc[i];
         break;
      }
   }

   if (!statevar) {
      this->fail = true;
      printf("Failed to find builtin uniform `%s'\n", ir->name);
      return;
   }

   int array_count;
   if (ir->type->is_array()) {
      array_count = ir->type->length;
   } else {
      array_count = 1;
   }

   for (int a = 0; a < array_count; a++) {
      for (unsigned int i = 0; i < statevar->num_elements; i++) {
         struct gl_builtin_uniform_element *element = &statevar->elements[i];
         int tokens[STATE_LENGTH];

         memcpy(tokens, element->tokens, sizeof(element->tokens));
         if (ir->type->is_array()) {
            tokens[1] = a;
         }

         /* This state reference has already been setup by ir_to_mesa,
          * but we'll get the same index back here.
          */
         int index = _mesa_add_state_reference(this->fp->Base.Parameters,
                                               (gl_state_index *)tokens);
         float *vec_values = this->fp->Base.Parameters->ParameterValues[index];

         /* Add each of the unique swizzles of the element as a
          * parameter.  This'll end up matching the expected layout of
          * the array/matrix/structure we're trying to fill in.
          */
         int last_swiz = -1;
         for (unsigned int i = 0; i < 4; i++) {
            int swiz = GET_SWZ(element->swizzle, i);
            if (swiz == last_swiz)
               break;
            last_swiz = swiz;

            c->prog_data.param[c->prog_data.nr_params++] = &vec_values[swiz];
         }
      }
   }
}

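/**
 * Builds gl_FragCoord from the computed pixel centers and interpolated
 * W, honoring the origin_upper_left and pixel_center_integer layout
 * qualifiers.
 */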
fs_reg *
fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   fs_reg wpos = *reg;
   fs_reg neg_y = this->pixel_y;
   neg_y.negate = true;

   /* gl_FragCoord.x */
   if (ir->pixel_center_integer) {
      emit(fs_inst(BRW_OPCODE_MOV, wpos, this->pixel_x));
   } else {
      emit(fs_inst(BRW_OPCODE_ADD, wpos, this->pixel_x, fs_reg(0.5f)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.y */
   if (ir->origin_upper_left && ir->pixel_center_integer) {
      emit(fs_inst(BRW_OPCODE_MOV, wpos, this->pixel_y));
   } else {
      fs_reg pixel_y = this->pixel_y;
      float offset = (ir->pixel_center_integer ? 0.0 : 0.5);

      if (!ir->origin_upper_left) {
         pixel_y.negate = true;
         offset += c->key.drawable_height - 1.0;
      }

      emit(fs_inst(BRW_OPCODE_ADD, wpos, pixel_y, fs_reg(offset)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.z */
   emit(fs_inst(FS_OPCODE_LINTERP, wpos, this->delta_x, this->delta_y,
                interp_reg(FRAG_ATTRIB_WPOS, 2)));
   wpos.reg_offset++;

   /* gl_FragCoord.w: Already set up in emit_interpolation */
   emit(fs_inst(BRW_OPCODE_MOV, wpos, this->wpos_w));

   return reg;
}

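/**
 * Emits per-channel LINTERP instructions for an ordinary varying,
 * walking array elements and matrix columns.  On pre-gen6 hardware the
 * attributes arrive perspective-divided, so each channel gets
 * multiplied back by this->pixel_w afterward.
 */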
fs_reg *
fs_visitor::emit_general_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   /* Interpolation is always in floating point regs. */
   reg->type = BRW_REGISTER_TYPE_F;
   fs_reg attr = *reg;

   unsigned int array_elements;
   const glsl_type *type;

   if (ir->type->is_array()) {
      array_elements = ir->type->length;
      if (array_elements == 0) {
         this->fail = true;
      }
      type = ir->type->fields.array;
   } else {
      array_elements = 1;
      type = ir->type;
   }

   int location = ir->location;
   for (unsigned int i = 0; i < array_elements; i++) {
      for (unsigned int j = 0; j < type->matrix_columns; j++) {
         if (urb_setup[location] == -1) {
            /* If there's no incoming setup data for this slot, don't
             * emit interpolation for it.
             */
            attr.reg_offset += type->vector_elements;
            location++;
            continue;
         }

         for (unsigned int c = 0; c < type->vector_elements; c++) {
            struct brw_reg interp = interp_reg(location, c);
            emit(fs_inst(FS_OPCODE_LINTERP,
                         attr,
                         this->delta_x,
                         this->delta_y,
                         fs_reg(interp)));
            attr.reg_offset++;
         }

         if (intel->gen < 6) {
            attr.reg_offset -= type->vector_elements;
            for (unsigned int c = 0; c < type->vector_elements; c++) {
               emit(fs_inst(BRW_OPCODE_MUL,
                            attr,
                            attr,
                            this->pixel_w));
               attr.reg_offset++;
            }
         }
         location++;
      }
   }

   return reg;
}

fs_reg *
fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);

   /* The frontfacing comes in as a bit in the thread payload. */
   if (intel->gen >= 6) {
      emit(fs_inst(BRW_OPCODE_ASR,
                   *reg,
                   fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
                   fs_reg(15)));
      emit(fs_inst(BRW_OPCODE_NOT,
                   *reg,
                   *reg));
      emit(fs_inst(BRW_OPCODE_AND,
                   *reg,
                   *reg,
                   fs_reg(1)));
   } else {
      struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
      /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
       * us front face
       */
      fs_inst *inst = emit(fs_inst(BRW_OPCODE_CMP,
                                   *reg,
                                   fs_reg(r1_6ud),
                                   fs_reg(1u << 31)));
      inst->conditional_mod = BRW_CONDITIONAL_L;
      emit(fs_inst(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u)));
   }

   return reg;
}

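/**
 * Emits a one-operand math-box instruction.  The math unit is fed
 * through message registers, so base_mrf/mlen are recorded here rather
 * than at code generation time.
 */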
fs_inst *
fs_visitor::emit_math(fs_opcodes opcode, fs_reg dst, fs_reg src)
{
   switch (opcode) {
   case FS_OPCODE_RCP:
   case FS_OPCODE_RSQ:
   case FS_OPCODE_SQRT:
   case FS_OPCODE_EXP2:
   case FS_OPCODE_LOG2:
   case FS_OPCODE_SIN:
   case FS_OPCODE_COS:
      break;
   default:
      assert(!"not reached: bad math opcode");
      return NULL;
   }
   fs_inst *inst = emit(fs_inst(opcode, dst, src));

   inst->base_mrf = 2;
   inst->mlen = 1;

   return inst;
}

fs_inst *
fs_visitor::emit_math(fs_opcodes opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   assert(opcode == FS_OPCODE_POW);

   fs_inst *inst = emit(fs_inst(opcode, dst, src0, src1));

   inst->base_mrf = 2;
   inst->mlen = 2;

   return inst;
}

void
fs_visitor::visit(ir_variable *ir)
{
   fs_reg *reg = NULL;

   if (variable_storage(ir))
      return;

   if (strcmp(ir->name, "gl_FragColor") == 0) {
      this->frag_color = ir;
   } else if (strcmp(ir->name, "gl_FragData") == 0) {
      this->frag_data = ir;
   } else if (strcmp(ir->name, "gl_FragDepth") == 0) {
      this->frag_depth = ir;
   }

   if (ir->mode == ir_var_in) {
      if (!strcmp(ir->name, "gl_FragCoord")) {
         reg = emit_fragcoord_interpolation(ir);
      } else if (!strcmp(ir->name, "gl_FrontFacing")) {
         reg = emit_frontfacing_interpolation(ir);
      } else {
         reg = emit_general_interpolation(ir);
      }
      assert(reg);
      hash_table_insert(this->variable_ht, reg, ir);
      return;
   }

   if (ir->mode == ir_var_uniform) {
      int param_index = c->prog_data.nr_params;

      if (!strncmp(ir->name, "gl_", 3)) {
         setup_builtin_uniform_values(ir);
      } else {
         setup_uniform_values(ir->location, ir->type);
      }

      reg = new(this->mem_ctx) fs_reg(UNIFORM, param_index);
   }

   if (!reg)
      reg = new(this->mem_ctx) fs_reg(this, ir->type);

   hash_table_insert(this->variable_ht, reg, ir);
}

void
fs_visitor::visit(ir_dereference_variable *ir)
{
   fs_reg *reg = variable_storage(ir->var);
   this->result = *reg;
}

void
fs_visitor::visit(ir_dereference_record *ir)
{
   const glsl_type *struct_type = ir->record->type;

   ir->record->accept(this);

   unsigned int offset = 0;
   for (unsigned int i = 0; i < struct_type->length; i++) {
      if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
         break;
      offset += type_size(struct_type->fields.structure[i].type);
   }
   this->result.reg_offset += offset;
   this->result.type = brw_type_for_base_type(ir->type);
}

void
fs_visitor::visit(ir_dereference_array *ir)
{
   ir_constant *index;
   int element_size;

   ir->array->accept(this);
   index = ir->array_index->as_constant();

   element_size = type_size(ir->type);
   this->result.type = brw_type_for_base_type(ir->type);

   if (index) {
      assert(this->result.file == UNIFORM ||
             (this->result.file == GRF &&
              this->result.reg != 0));
      this->result.reg_offset += index->value.i[0] * element_size;
   } else {
      assert(!"FINISHME: non-constant array element");
   }
}

void
fs_visitor::visit(ir_expression *ir)
{
   unsigned int operand;
   fs_reg op[2], temp;
   fs_reg result;
   fs_inst *inst;

   for (operand = 0; operand < ir->get_num_operands(); operand++) {
      ir->operands[operand]->accept(this);
      if (this->result.file == BAD_FILE) {
         ir_print_visitor v;
         printf("Failed to get tree for expression operand:\n");
         ir->operands[operand]->accept(&v);
         this->fail = true;
      }
      op[operand] = this->result;

      /* Matrix expression operands should have been broken down to vector
       * operations already.
       */
      assert(!ir->operands[operand]->type->is_matrix());
      /* And then those vector operands should have been broken down to scalar.
       */
      assert(!ir->operands[operand]->type->is_vector());
   }

   /* Storage for our result.  If our result goes into an assignment, it will
    * just get copy-propagated out, so no worries.
    */
   this->result = fs_reg(this, ir->type);

   switch (ir->operation) {
   case ir_unop_logic_not:
      emit(fs_inst(BRW_OPCODE_ADD, this->result, op[0], fs_reg(-1)));
      break;
   case ir_unop_neg:
      op[0].negate = !op[0].negate;
      this->result = op[0];
      break;
   case ir_unop_abs:
      op[0].abs = true;
      this->result = op[0];
      break;
   case ir_unop_sign:
      temp = fs_reg(this, ir->type);

      emit(fs_inst(BRW_OPCODE_MOV, this->result, fs_reg(0.0f)));

      inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null, op[0], fs_reg(0.0f)));
      inst->conditional_mod = BRW_CONDITIONAL_G;
      inst = emit(fs_inst(BRW_OPCODE_MOV, this->result, fs_reg(1.0f)));
      inst->predicated = true;

      inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null, op[0], fs_reg(0.0f)));
      inst->conditional_mod = BRW_CONDITIONAL_L;
      inst = emit(fs_inst(BRW_OPCODE_MOV, this->result, fs_reg(-1.0f)));
      inst->predicated = true;

      break;
   case ir_unop_rcp:
      emit_math(FS_OPCODE_RCP, this->result, op[0]);
      break;

   case ir_unop_exp2:
      emit_math(FS_OPCODE_EXP2, this->result, op[0]);
      break;
   case ir_unop_log2:
      emit_math(FS_OPCODE_LOG2, this->result, op[0]);
      break;
   case ir_unop_exp:
   case ir_unop_log:
      assert(!"not reached: should be handled by ir_explog_to_explog2");
      break;
   case ir_unop_sin:
      emit_math(FS_OPCODE_SIN, this->result, op[0]);
      break;
   case ir_unop_cos:
      emit_math(FS_OPCODE_COS, this->result, op[0]);
      break;

   case ir_unop_dFdx:
      emit(fs_inst(FS_OPCODE_DDX, this->result, op[0]));
      break;
   case ir_unop_dFdy:
      emit(fs_inst(FS_OPCODE_DDY, this->result, op[0]));
      break;

   case ir_binop_add:
      emit(fs_inst(BRW_OPCODE_ADD, this->result, op[0], op[1]));
      break;
   case ir_binop_sub:
      assert(!"not reached: should be handled by ir_sub_to_add_neg");
      break;

   case ir_binop_mul:
      emit(fs_inst(BRW_OPCODE_MUL, this->result, op[0], op[1]));
      break;
   case ir_binop_div:
      assert(!"not reached: should be handled by ir_div_to_mul_rcp");
      break;
   case ir_binop_mod:
      assert(!"ir_binop_mod should have been converted to b * fract(a/b)");
      break;

   case ir_binop_less:
      inst = emit(fs_inst(BRW_OPCODE_CMP, this->result, op[0], op[1]));
      inst->conditional_mod = BRW_CONDITIONAL_L;
      emit(fs_inst(BRW_OPCODE_AND, this->result, this->result, fs_reg(0x1)));
      break;
   case ir_binop_greater:
      inst = emit(fs_inst(BRW_OPCODE_CMP, this->result, op[0], op[1]));
      inst->conditional_mod = BRW_CONDITIONAL_G;
      emit(fs_inst(BRW_OPCODE_AND, this->result, this->result, fs_reg(0x1)));
      break;
   case ir_binop_lequal:
      inst = emit(fs_inst(BRW_OPCODE_CMP, this->result, op[0], op[1]));
      inst->conditional_mod = BRW_CONDITIONAL_LE;
      emit(fs_inst(BRW_OPCODE_AND, this->result, this->result, fs_reg(0x1)));
      break;
   case ir_binop_gequal:
      inst = emit(fs_inst(BRW_OPCODE_CMP, this->result, op[0], op[1]));
      inst->conditional_mod = BRW_CONDITIONAL_GE;
      emit(fs_inst(BRW_OPCODE_AND, this->result, this->result, fs_reg(0x1)));
      break;
   case ir_binop_equal:
   case ir_binop_all_equal: /* same as equal for scalars */
      inst = emit(fs_inst(BRW_OPCODE_CMP, this->result, op[0], op[1]));
      inst->conditional_mod = BRW_CONDITIONAL_Z;
      emit(fs_inst(BRW_OPCODE_AND, this->result, this->result, fs_reg(0x1)));
      break;
   case ir_binop_nequal:
   case ir_binop_any_nequal: /* same as nequal for scalars */
      inst = emit(fs_inst(BRW_OPCODE_CMP, this->result, op[0], op[1]));
      inst->conditional_mod = BRW_CONDITIONAL_NZ;
      emit(fs_inst(BRW_OPCODE_AND, this->result, this->result, fs_reg(0x1)));
      break;

   case ir_binop_logic_xor:
      emit(fs_inst(BRW_OPCODE_XOR, this->result, op[0], op[1]));
      break;

   case ir_binop_logic_or:
      emit(fs_inst(BRW_OPCODE_OR, this->result, op[0], op[1]));
      break;

   case ir_binop_logic_and:
      emit(fs_inst(BRW_OPCODE_AND, this->result, op[0], op[1]));
      break;

   case ir_binop_dot:
   case ir_binop_cross:
   case ir_unop_any:
      assert(!"not reached: should be handled by brw_fs_channel_expressions");
      break;

   case ir_unop_noise:
      assert(!"not reached: should be handled by lower_noise");
      break;

   case ir_unop_sqrt:
      emit_math(FS_OPCODE_SQRT, this->result, op[0]);
      break;

   case ir_unop_rsq:
      emit_math(FS_OPCODE_RSQ, this->result, op[0]);
      break;

   case ir_unop_i2f:
   case ir_unop_b2f:
   case ir_unop_b2i:
      emit(fs_inst(BRW_OPCODE_MOV, this->result, op[0]));
      break;
   case ir_unop_f2i:
      emit(fs_inst(BRW_OPCODE_MOV, this->result, op[0]));
      break;
   case ir_unop_f2b:
   case ir_unop_i2b:
      inst = emit(fs_inst(BRW_OPCODE_CMP, this->result, op[0], fs_reg(0.0f)));
      inst->conditional_mod = BRW_CONDITIONAL_NZ;
      break;

   case ir_unop_trunc:
      emit(fs_inst(BRW_OPCODE_RNDD, this->result, op[0]));
      break;
   case ir_unop_ceil:
      op[0].negate = !op[0].negate;
      inst = emit(fs_inst(BRW_OPCODE_RNDD, this->result, op[0]));
      this->result.negate = true;
      break;
   case ir_unop_floor:
      inst = emit(fs_inst(BRW_OPCODE_RNDD, this->result, op[0]));
      break;
   case ir_unop_fract:
      inst = emit(fs_inst(BRW_OPCODE_FRC, this->result, op[0]));
      break;

   case ir_binop_min:
      inst = emit(fs_inst(BRW_OPCODE_CMP, this->result, op[0], op[1]));
      inst->conditional_mod = BRW_CONDITIONAL_L;

      inst = emit(fs_inst(BRW_OPCODE_SEL, this->result, op[0], op[1]));
      inst->predicated = true;
      break;
   case ir_binop_max:
      inst = emit(fs_inst(BRW_OPCODE_CMP, this->result, op[0], op[1]));
      inst->conditional_mod = BRW_CONDITIONAL_G;

      inst = emit(fs_inst(BRW_OPCODE_SEL, this->result, op[0], op[1]));
      inst->predicated = true;
      break;

   case ir_binop_pow:
      emit_math(FS_OPCODE_POW, this->result, op[0], op[1]);
      break;

   case ir_unop_bit_not:
   case ir_unop_u2f:
   case ir_binop_lshift:
   case ir_binop_rshift:
   case ir_binop_bit_and:
   case ir_binop_bit_xor:
   case ir_binop_bit_or:
      assert(!"GLSL 1.30 features unsupported");
      break;
   }
}

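/**
 * Recursively emits one MOV per scalar component when assigning an
 * aggregate value, advancing the l and r register offsets in step and
 * predicating each write if the assignment is conditional.
 */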
void
fs_visitor::emit_assignment_writes(fs_reg &l, fs_reg &r,
                                   const glsl_type *type, bool predicated)
{
   switch (type->base_type) {
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_BOOL:
      for (unsigned int i = 0; i < type->components(); i++) {
         l.type = brw_type_for_base_type(type);
         r.type = brw_type_for_base_type(type);

         fs_inst *inst = emit(fs_inst(BRW_OPCODE_MOV, l, r));
         inst->predicated = predicated;

         l.reg_offset++;
         r.reg_offset++;
      }
      break;
   case GLSL_TYPE_ARRAY:
      for (unsigned int i = 0; i < type->length; i++) {
         emit_assignment_writes(l, r, type->fields.array, predicated);
      }
      break;

   case GLSL_TYPE_STRUCT:
      for (unsigned int i = 0; i < type->length; i++) {
         emit_assignment_writes(l, r, type->fields.structure[i].type,
                                predicated);
      }
      break;

   case GLSL_TYPE_SAMPLER:
      break;

   default:
      assert(!"not reached");
      break;
   }
}

void
fs_visitor::visit(ir_assignment *ir)
{
   struct fs_reg l, r;
   fs_inst *inst;

   /* FINISHME: arrays on the lhs */
   ir->lhs->accept(this);
   l = this->result;

   ir->rhs->accept(this);
   r = this->result;

   assert(l.file != BAD_FILE);
   assert(r.file != BAD_FILE);

   if (ir->condition) {
      /* Get the condition bool into the predicate. */
      ir->condition->accept(this);
      inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null, this->result, fs_reg(0)));
      inst->conditional_mod = BRW_CONDITIONAL_NZ;
   }

   if (ir->lhs->type->is_scalar() ||
       ir->lhs->type->is_vector()) {
      for (int i = 0; i < ir->lhs->type->vector_elements; i++) {
         if (ir->write_mask & (1 << i)) {
            inst = emit(fs_inst(BRW_OPCODE_MOV, l, r));
            if (ir->condition)
               inst->predicated = true;
            r.reg_offset++;
         }
         l.reg_offset++;
      }
   } else {
      emit_assignment_writes(l, r, ir->lhs->type, ir->condition != NULL);
   }
}

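/**
 * Assembles the gen4 sampler message: the g0 header, then coordinates,
 * shadow comparitor, and LOD/bias in the layout the original hardware
 * expects.  Non-shadow bias/LOD messages only exist in SIMD16 form on
 * gen4, so those take the interleaved path below.
 */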
fs_inst *
fs_visitor::emit_texture_gen4(ir_texture *ir, fs_reg dst, fs_reg coordinate)
{
   int mlen;
   int base_mrf = 1;
   bool simd16 = false;
   fs_reg orig_dst;

   /* g0 header. */
   mlen = 1;

   if (ir->shadow_comparitor) {
      for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
         emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i),
                      coordinate));
         coordinate.reg_offset++;
      }
      /* gen4's SIMD8 sampler always has the slots for u,v,r present. */
      mlen += 3;

      if (ir->op == ir_tex) {
         /* There's no plain shadow compare message, so we use shadow
          * compare with a bias of 0.0.
          */
         emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen),
                      fs_reg(0.0f)));
         mlen++;
      } else if (ir->op == ir_txb) {
         ir->lod_info.bias->accept(this);
         emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen),
                      this->result));
         mlen++;
      } else {
         assert(ir->op == ir_txl);
         ir->lod_info.lod->accept(this);
         emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen),
                      this->result));
         mlen++;
      }

      ir->shadow_comparitor->accept(this);
      emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result));
      mlen++;
   } else if (ir->op == ir_tex) {
      for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
         emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i),
                      coordinate));
         coordinate.reg_offset++;
      }
      /* gen4's SIMD8 sampler always has the slots for u,v,r present. */
      mlen += 3;
   } else {
      /* Oh joy.  gen4 doesn't have SIMD8 non-shadow-compare bias/lod
       * instructions.  We'll need to do SIMD16 here.
       */
      assert(ir->op == ir_txb || ir->op == ir_txl);

      for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
         emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i * 2),
                      coordinate));
         coordinate.reg_offset++;
      }

      /* lod/bias appears after u/v/r. */
      mlen += 6;

      if (ir->op == ir_txb) {
         ir->lod_info.bias->accept(this);
         emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen),
                      this->result));
         mlen++;
      } else {
         ir->lod_info.lod->accept(this);
         emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen),
                      this->result));
         mlen++;
      }

      /* The unused upper half. */
      mlen++;

      /* Now, since we're doing simd16, the return is 2 interleaved
       * vec4s where the odd-indexed ones are junk. We'll need to move
       * this weirdness around to the expected layout.
       */
      simd16 = true;
      orig_dst = dst;
      dst = fs_reg(this, glsl_type::get_array_instance(glsl_type::vec4_type,
                                                       2));
      dst.type = BRW_REGISTER_TYPE_F;
   }

   fs_inst *inst = NULL;
   switch (ir->op) {
   case ir_tex:
      inst = emit(fs_inst(FS_OPCODE_TEX, dst));
      break;
   case ir_txb:
      inst = emit(fs_inst(FS_OPCODE_TXB, dst));
      break;
   case ir_txl:
      inst = emit(fs_inst(FS_OPCODE_TXL, dst));
      break;
   case ir_txd:
   case ir_txf:
      assert(!"GLSL 1.30 features unsupported");
      break;
   }
   inst->base_mrf = base_mrf;
   inst->mlen = mlen;

   if (simd16) {
      for (int i = 0; i < 4; i++) {
         emit(fs_inst(BRW_OPCODE_MOV, orig_dst, dst));
         orig_dst.reg_offset++;
         dst.reg_offset += 2;
      }
   }

   return inst;
}

fs_inst *
fs_visitor::emit_texture_gen5(ir_texture *ir, fs_reg dst, fs_reg coordinate)
{
   /* gen5's SIMD8 sampler has slots for u, v, r, array index, then
    * optional parameters like shadow comparitor or LOD bias.  If
    * optional parameters aren't present, those base slots are
    * optional and don't need to be included in the message.
    *
    * We don't fill in the unnecessary slots regardless, which may
    * look surprising in the disassembly.
    */
   int mlen = 1; /* g0 header always present. */
   int base_mrf = 1;

   for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
      emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i),
                   coordinate));
      coordinate.reg_offset++;
   }
   mlen += ir->coordinate->type->vector_elements;

   if (ir->shadow_comparitor) {
      mlen = MAX2(mlen, 5);

      ir->shadow_comparitor->accept(this);
      emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result));
      mlen++;
   }

   fs_inst *inst = NULL;
   switch (ir->op) {
   case ir_tex:
      inst = emit(fs_inst(FS_OPCODE_TEX, dst));
      break;
   case ir_txb:
      ir->lod_info.bias->accept(this);
      mlen = MAX2(mlen, 5);
      emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result));
      mlen++;

      inst = emit(fs_inst(FS_OPCODE_TXB, dst));
      break;
   case ir_txl:
      ir->lod_info.lod->accept(this);
      mlen = MAX2(mlen, 5);
      emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result));
      mlen++;

      inst = emit(fs_inst(FS_OPCODE_TXL, dst));
      break;
   case ir_txd:
   case ir_txf:
      assert(!"GLSL 1.30 features unsupported");
      break;
   }
   inst->base_mrf = base_mrf;
   inst->mlen = mlen;

   return inst;
}

void
fs_visitor::visit(ir_texture *ir)
{
   fs_inst *inst = NULL;

   ir->coordinate->accept(this);
   fs_reg coordinate = this->result;

   /* Should be lowered by do_lower_texture_projection */
   assert(!ir->projector);

   /* Writemasking doesn't eliminate channels on SIMD8 texture
    * samples, so don't worry about them.
    */
   fs_reg dst = fs_reg(this, glsl_type::vec4_type);

   if (intel->gen < 5) {
      inst = emit_texture_gen4(ir, dst, coordinate);
   } else {
      inst = emit_texture_gen5(ir, dst, coordinate);
   }

   inst->sampler =
      _mesa_get_sampler_uniform_value(ir->sampler,
                                      ctx->Shader.CurrentProgram,
                                      &brw->fragment_program->Base);
   inst->sampler = c->fp->program.Base.SamplerUnits[inst->sampler];

   this->result = dst;

   if (ir->shadow_comparitor)
      inst->shadow_compare = true;

   if (c->key.tex_swizzles[inst->sampler] != SWIZZLE_NOOP) {
      fs_reg swizzle_dst = fs_reg(this, glsl_type::vec4_type);

      for (int i = 0; i < 4; i++) {
         int swiz = GET_SWZ(c->key.tex_swizzles[inst->sampler], i);
         fs_reg l = swizzle_dst;
         l.reg_offset += i;

         if (swiz == SWIZZLE_ZERO) {
            emit(fs_inst(BRW_OPCODE_MOV, l, fs_reg(0.0f)));
         } else if (swiz == SWIZZLE_ONE) {
            emit(fs_inst(BRW_OPCODE_MOV, l, fs_reg(1.0f)));
         } else {
            fs_reg r = dst;
            r.reg_offset += swiz;
            emit(fs_inst(BRW_OPCODE_MOV, l, r));
         }
      }
      this->result = swizzle_dst;
   }
}

void
fs_visitor::visit(ir_swizzle *ir)
{
   ir->val->accept(this);
   fs_reg val = this->result;

   if (ir->type->vector_elements == 1) {
      this->result.reg_offset += ir->mask.x;
      return;
   }

   fs_reg result = fs_reg(this, ir->type);
   this->result = result;

   for (unsigned int i = 0; i < ir->type->vector_elements; i++) {
      fs_reg channel = val;
      int swiz = 0;

      switch (i) {
      case 0:
         swiz = ir->mask.x;
         break;
      case 1:
         swiz = ir->mask.y;
         break;
      case 2:
         swiz = ir->mask.z;
         break;
      case 3:
         swiz = ir->mask.w;
         break;
      }

      channel.reg_offset += swiz;
      emit(fs_inst(BRW_OPCODE_MOV, result, channel));
      result.reg_offset++;
   }
}

void
fs_visitor::visit(ir_discard *ir)
{
   fs_reg temp = fs_reg(this, glsl_type::uint_type);

   assert(ir->condition == NULL); /* FINISHME */

   emit(fs_inst(FS_OPCODE_DISCARD_NOT, temp, reg_null));
   emit(fs_inst(FS_OPCODE_DISCARD_AND, reg_null, temp));
   kill_emitted = true;
}

void
fs_visitor::visit(ir_constant *ir)
{
   fs_reg reg(this, ir->type);
   this->result = reg;

   for (unsigned int i = 0; i < ir->type->vector_elements; i++) {
      switch (ir->type->base_type) {
      case GLSL_TYPE_FLOAT:
         emit(fs_inst(BRW_OPCODE_MOV, reg, fs_reg(ir->value.f[i])));
         break;
      case GLSL_TYPE_UINT:
         emit(fs_inst(BRW_OPCODE_MOV, reg, fs_reg(ir->value.u[i])));
         break;
      case GLSL_TYPE_INT:
         emit(fs_inst(BRW_OPCODE_MOV, reg, fs_reg(ir->value.i[i])));
         break;
      case GLSL_TYPE_BOOL:
         emit(fs_inst(BRW_OPCODE_MOV, reg, fs_reg((int)ir->value.b[i])));
         break;
      default:
         assert(!"Non-float/uint/int/bool constant");
      }
      reg.reg_offset++;
   }
}

void
fs_visitor::visit(ir_if *ir)
{
   fs_inst *inst;

   /* Don't point the annotation at the if statement, because then it plus
    * the then and else blocks get printed.
    */
   this->base_ir = ir->condition;

   /* Generate the condition into the condition code. */
   ir->condition->accept(this);
   inst = emit(fs_inst(BRW_OPCODE_MOV, fs_reg(brw_null_reg()), this->result));
   inst->conditional_mod = BRW_CONDITIONAL_NZ;

   inst = emit(fs_inst(BRW_OPCODE_IF));
   inst->predicated = true;

   foreach_iter(exec_list_iterator, iter, ir->then_instructions) {
      ir_instruction *ir = (ir_instruction *)iter.get();
      this->base_ir = ir;

      ir->accept(this);
   }

   if (!ir->else_instructions.is_empty()) {
      emit(fs_inst(BRW_OPCODE_ELSE));

      foreach_iter(exec_list_iterator, iter, ir->else_instructions) {
         ir_instruction *ir = (ir_instruction *)iter.get();
         this->base_ir = ir;

         ir->accept(this);
      }
   }

   emit(fs_inst(BRW_OPCODE_ENDIF));
}

void
fs_visitor::visit(ir_loop *ir)
{
   fs_reg counter = reg_undef;

   if (ir->counter) {
      this->base_ir = ir->counter;
      ir->counter->accept(this);
      counter = *(variable_storage(ir->counter));

      if (ir->from) {
         this->base_ir = ir->from;
         ir->from->accept(this);

         emit(fs_inst(BRW_OPCODE_MOV, counter, this->result));
      }
   }

   emit(fs_inst(BRW_OPCODE_DO));

   if (ir->to) {
      this->base_ir = ir->to;
      ir->to->accept(this);

      fs_inst *inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null,
                                   counter, this->result));
      switch (ir->cmp) {
      case ir_binop_equal:
         inst->conditional_mod = BRW_CONDITIONAL_Z;
         break;
      case ir_binop_nequal:
         inst->conditional_mod = BRW_CONDITIONAL_NZ;
         break;
      case ir_binop_gequal:
         inst->conditional_mod = BRW_CONDITIONAL_GE;
         break;
      case ir_binop_lequal:
         inst->conditional_mod = BRW_CONDITIONAL_LE;
         break;
      case ir_binop_greater:
         inst->conditional_mod = BRW_CONDITIONAL_G;
         break;
      case ir_binop_less:
         inst->conditional_mod = BRW_CONDITIONAL_L;
         break;
      default:
         assert(!"not reached: unknown loop condition");
         this->fail = true;
         break;
      }

      inst = emit(fs_inst(BRW_OPCODE_BREAK));
      inst->predicated = true;
   }

   foreach_iter(exec_list_iterator, iter, ir->body_instructions) {
      ir_instruction *ir = (ir_instruction *)iter.get();

      this->base_ir = ir;
      ir->accept(this);
   }

   if (ir->increment) {
      this->base_ir = ir->increment;
      ir->increment->accept(this);
      emit(fs_inst(BRW_OPCODE_ADD, counter, counter, this->result));
   }

   emit(fs_inst(BRW_OPCODE_WHILE));
}

void
fs_visitor::visit(ir_loop_jump *ir)
{
   switch (ir->mode) {
   case ir_loop_jump::jump_break:
      emit(fs_inst(BRW_OPCODE_BREAK));
      break;
   case ir_loop_jump::jump_continue:
      emit(fs_inst(BRW_OPCODE_CONTINUE));
      break;
   }
}

void
fs_visitor::visit(ir_call *ir)
{
   assert(!"FINISHME");
}

void
fs_visitor::visit(ir_return *ir)
{
   assert(!"FINISHME");
}

void
fs_visitor::visit(ir_function *ir)
{
   /* Ignore function bodies other than main() -- we shouldn't see calls to
    * them since they should all be inlined before we get to ir_to_mesa.
    */
   if (strcmp(ir->name, "main") == 0) {
      const ir_function_signature *sig;
      exec_list empty;

      sig = ir->matching_signature(&empty);

      assert(sig);

      foreach_iter(exec_list_iterator, iter, sig->body) {
         ir_instruction *ir = (ir_instruction *)iter.get();
         this->base_ir = ir;

         ir->accept(this);
      }
   }
}

void
fs_visitor::visit(ir_function_signature *ir)
{
   assert(!"not reached");
   (void)ir;
}

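/**
 * Copies the passed instruction into talloc'd storage, tags it with the
 * current annotation and IR pointer for debug output, and appends it to
 * the instruction list.
 */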
fs_inst *
fs_visitor::emit(fs_inst inst)
{
   fs_inst *list_inst = new(mem_ctx) fs_inst;
   *list_inst = inst;

   list_inst->annotation = this->current_annotation;
   list_inst->ir = this->base_ir;

   this->instructions.push_tail(list_inst);

   return list_inst;
}

/** Emits a dummy fragment shader consisting of magenta for bringup purposes. */
void
fs_visitor::emit_dummy_fs()
{
   /* Everyone's favorite color. */
   emit(fs_inst(BRW_OPCODE_MOV,
                fs_reg(MRF, 2),
                fs_reg(1.0f)));
   emit(fs_inst(BRW_OPCODE_MOV,
                fs_reg(MRF, 3),
                fs_reg(0.0f)));
   emit(fs_inst(BRW_OPCODE_MOV,
                fs_reg(MRF, 4),
                fs_reg(1.0f)));
   emit(fs_inst(BRW_OPCODE_MOV,
                fs_reg(MRF, 5),
                fs_reg(0.0f)));

   fs_inst *write;
   write = emit(fs_inst(FS_OPCODE_FB_WRITE,
                        fs_reg(0),
                        fs_reg(0)));
   write->base_mrf = 0;
}

/* The register location here is relative to the start of the URB
 * data.  It will get adjusted to be a real location before
 * generate_code() time.
 */
struct brw_reg
fs_visitor::interp_reg(int location, int channel)
{
   int regnr = urb_setup[location] * 2 + channel / 2;
   int stride = (channel & 1) * 4;

   assert(urb_setup[location] != -1);

   return brw_vec1_grf(regnr, stride);
}

/** Emits the interpolation for the varying inputs. */
void
fs_visitor::emit_interpolation_setup_gen4()
{
   struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);

   this->current_annotation = "compute pixel centers";
   this->pixel_x = fs_reg(this, glsl_type::uint_type);
   this->pixel_y = fs_reg(this, glsl_type::uint_type);
   this->pixel_x.type = BRW_REGISTER_TYPE_UW;
   this->pixel_y.type = BRW_REGISTER_TYPE_UW;
   emit(fs_inst(BRW_OPCODE_ADD,
                this->pixel_x,
                fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
                fs_reg(brw_imm_v(0x10101010))));
   emit(fs_inst(BRW_OPCODE_ADD,
                this->pixel_y,
                fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
                fs_reg(brw_imm_v(0x11001100))));

   this->current_annotation = "compute pixel deltas from v0";
   if (brw->has_pln) {
      this->delta_x = fs_reg(this, glsl_type::vec2_type);
      this->delta_y = this->delta_x;
      this->delta_y.reg_offset++;
   } else {
      this->delta_x = fs_reg(this, glsl_type::float_type);
      this->delta_y = fs_reg(this, glsl_type::float_type);
   }
   emit(fs_inst(BRW_OPCODE_ADD,
                this->delta_x,
                this->pixel_x,
                fs_reg(negate(brw_vec1_grf(1, 0)))));
   emit(fs_inst(BRW_OPCODE_ADD,
                this->delta_y,
                this->pixel_y,
                fs_reg(negate(brw_vec1_grf(1, 1)))));

   this->current_annotation = "compute pos.w and 1/pos.w";
   /* Compute wpos.w.  It's always in our setup, since it's needed to
    * interpolate the other attributes.
    */
   this->wpos_w = fs_reg(this, glsl_type::float_type);
   emit(fs_inst(FS_OPCODE_LINTERP, wpos_w, this->delta_x, this->delta_y,
                interp_reg(FRAG_ATTRIB_WPOS, 3)));
   /* Compute the pixel 1/W value from wpos.w. */
   this->pixel_w = fs_reg(this, glsl_type::float_type);
   emit_math(FS_OPCODE_RCP, this->pixel_w, wpos_w);
   this->current_annotation = NULL;
}

/** Emits the interpolation for the varying inputs. */
void
fs_visitor::emit_interpolation_setup_gen6()
{
   struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);

   /* If the pixel centers end up used, the setup is the same as for gen4. */
   this->current_annotation = "compute pixel centers";
   this->pixel_x = fs_reg(this, glsl_type::uint_type);
   this->pixel_y = fs_reg(this, glsl_type::uint_type);
   this->pixel_x.type = BRW_REGISTER_TYPE_UW;
   this->pixel_y.type = BRW_REGISTER_TYPE_UW;
   emit(fs_inst(BRW_OPCODE_ADD,
                this->pixel_x,
                fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
                fs_reg(brw_imm_v(0x10101010))));
   emit(fs_inst(BRW_OPCODE_ADD,
                this->pixel_y,
                fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
                fs_reg(brw_imm_v(0x11001100))));

   this->current_annotation = "compute 1/pos.w";
   this->wpos_w = fs_reg(brw_vec8_grf(c->key.source_w_reg, 0));
   this->pixel_w = fs_reg(this, glsl_type::float_type);
   emit_math(FS_OPCODE_RCP, this->pixel_w, wpos_w);

   this->delta_x = fs_reg(brw_vec8_grf(2, 0));
   this->delta_y = fs_reg(brw_vec8_grf(3, 0));

   this->current_annotation = NULL;
}

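/**
 * Emits the framebuffer write messages: an optional header, the
 * AA/depth payload registers the compile key asks for, and one
 * FB_WRITE per color region, with EOT set on the last one.
 */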
void
fs_visitor::emit_fb_writes()
{
   this->current_annotation = "FB write header";
   GLboolean header_present = GL_TRUE;
   int nr = 0;

   if (intel->gen >= 6 &&
       !this->kill_emitted &&
       c->key.nr_color_regions == 1) {
      header_present = false;
   }

   if (header_present) {
      /* m0, m1 header */
      nr += 2;
   }

   if (c->key.aa_dest_stencil_reg) {
      emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
                   fs_reg(brw_vec8_grf(c->key.aa_dest_stencil_reg, 0))));
   }

   /* Reserve space for color. It'll be filled in per MRT below. */
   int color_mrf = nr;
   nr += 4;

   if (c->key.source_depth_to_render_target) {
      if (c->key.computes_depth) {
         /* Hand over gl_FragDepth. */
         assert(this->frag_depth);
         fs_reg depth = *(variable_storage(this->frag_depth));

         emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, nr++), depth));
      } else {
         /* Pass through the payload depth. */
         emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
                      fs_reg(brw_vec8_grf(c->key.source_depth_reg, 0))));
      }
   }

   if (c->key.dest_depth_reg) {
      emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
                   fs_reg(brw_vec8_grf(c->key.dest_depth_reg, 0))));
   }

   fs_reg color = reg_undef;
   if (this->frag_color)
      color = *(variable_storage(this->frag_color));
   else if (this->frag_data)
      color = *(variable_storage(this->frag_data));

   for (int target = 0; target < c->key.nr_color_regions; target++) {
      this->current_annotation = talloc_asprintf(this->mem_ctx,
                                                 "FB write target %d",
                                                 target);
      if (this->frag_color || this->frag_data) {
         for (int i = 0; i < 4; i++) {
            emit(fs_inst(BRW_OPCODE_MOV,
                         fs_reg(MRF, color_mrf + i),
                         color));
            color.reg_offset++;
         }
      }

      if (this->frag_color)
         color.reg_offset -= 4;

      fs_inst *inst = emit(fs_inst(FS_OPCODE_FB_WRITE,
                                   reg_undef, reg_undef));
      inst->target = target;
      inst->base_mrf = 0;
      inst->mlen = nr;
      if (target == c->key.nr_color_regions - 1)
         inst->eot = true;
      inst->header_present = header_present;
   }

   if (c->key.nr_color_regions == 0) {
      fs_inst *inst = emit(fs_inst(FS_OPCODE_FB_WRITE,
                                   reg_undef, reg_undef));
      inst->base_mrf = 0;
      inst->mlen = nr;
      inst->eot = true;
      inst->header_present = header_present;
   }

   this->current_annotation = NULL;
}

void
fs_visitor::generate_fb_write(fs_inst *inst)
{
   GLboolean eot = inst->eot;
   struct brw_reg implied_header;

   /* Header is 2 regs, g0 and g1 are the contents. g0 will be implied
    * move, here's g1.
    */
   brw_push_insn_state(p);
   brw_set_mask_control(p, BRW_MASK_DISABLE);
   brw_set_compression_control(p, BRW_COMPRESSION_NONE);

   if (inst->header_present) {
      if (intel->gen >= 6) {
         brw_MOV(p,
                 brw_message_reg(inst->base_mrf),
                 brw_vec8_grf(0, 0));
         implied_header = brw_null_reg();
      } else {
         implied_header = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
      }

      brw_MOV(p,
              brw_message_reg(inst->base_mrf + 1),
              brw_vec8_grf(1, 0));
   } else {
      implied_header = brw_null_reg();
   }

   brw_pop_insn_state(p);

   brw_fb_WRITE(p,
                8, /* dispatch_width */
                retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW),
                inst->base_mrf,
                implied_header,
                inst->target,
                inst->mlen,
                0,
                eot);
}

void
fs_visitor::generate_linterp(fs_inst *inst,
                             struct brw_reg dst, struct brw_reg *src)
{
   struct brw_reg delta_x = src[0];
   struct brw_reg delta_y = src[1];
   struct brw_reg interp = src[2];

   if (brw->has_pln &&
       delta_y.nr == delta_x.nr + 1 &&
       (intel->gen >= 6 || (delta_x.nr & 1) == 0)) {
      brw_PLN(p, dst, interp, delta_x);
   } else {
      brw_LINE(p, brw_null_reg(), interp, delta_x);
      brw_MAC(p, dst, suboffset(interp, 1), delta_y);
   }
}

void
fs_visitor::generate_math(fs_inst *inst,
                          struct brw_reg dst, struct brw_reg *src)
{
   int op;

   switch (inst->opcode) {
   case FS_OPCODE_RCP:
      op = BRW_MATH_FUNCTION_INV;
      break;
   case FS_OPCODE_RSQ:
      op = BRW_MATH_FUNCTION_RSQ;
      break;
   case FS_OPCODE_SQRT:
      op = BRW_MATH_FUNCTION_SQRT;
      break;
   case FS_OPCODE_EXP2:
      op = BRW_MATH_FUNCTION_EXP;
      break;
   case FS_OPCODE_LOG2:
      op = BRW_MATH_FUNCTION_LOG;
      break;
   case FS_OPCODE_POW:
      op = BRW_MATH_FUNCTION_POW;
      break;
   case FS_OPCODE_SIN:
      op = BRW_MATH_FUNCTION_SIN;
      break;
   case FS_OPCODE_COS:
      op = BRW_MATH_FUNCTION_COS;
      break;
   default:
      assert(!"not reached: unknown math function");
      op = 0;
      break;
   }

   assert(inst->mlen >= 1);

   if (inst->opcode == FS_OPCODE_POW) {
      brw_MOV(p, brw_message_reg(inst->base_mrf + 1), src[1]);
   }

   brw_math(p, dst,
            op,
            inst->saturate ? BRW_MATH_SATURATE_SATURATE :
            BRW_MATH_SATURATE_NONE,
            inst->base_mrf, src[0],
            BRW_MATH_DATA_VECTOR,
            BRW_MATH_PRECISION_FULL);
}

void
fs_visitor::generate_tex(fs_inst *inst, struct brw_reg dst)
{
   int msg_type = -1;
   int rlen = 4;
   uint32_t simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;

   if (intel->gen >= 5) {
      switch (inst->opcode) {
      case FS_OPCODE_TEX:
         if (inst->shadow_compare) {
            msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_COMPARE_GEN5;
         } else {
            msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_GEN5;
         }
         break;
      case FS_OPCODE_TXB:
         if (inst->shadow_compare) {
            msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE_GEN5;
         } else {
            msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_BIAS_GEN5;
         }
         break;
      }
   } else {
      switch (inst->opcode) {
      case FS_OPCODE_TEX:
         /* Note that G45 and older determine shadow compare and dispatch
          * width from message length for most messages.
          */
1805	 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
1806	 if (inst->shadow_compare) {
1807	    assert(inst->mlen == 5);
1808	 } else {
1809	    assert(inst->mlen <= 6);
1810	 }
1811	 break;
1812      case FS_OPCODE_TXB:
1813	 if (inst->shadow_compare) {
1814	    assert(inst->mlen == 5);
1815	    msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
1816	 } else {
1817	    assert(inst->mlen == 8);
1818	    msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
1819	    simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
1820	 }
1821	 break;
1822      }
1823   }
1824   assert(msg_type != -1);
1825
1826   if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) {
1827      rlen = 8;
1828      dst = vec16(dst);
1829   }
1830
1831   brw_SAMPLE(p,
1832	      retype(dst, BRW_REGISTER_TYPE_UW),
1833	      inst->base_mrf,
1834	      retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
1835              SURF_INDEX_TEXTURE(inst->sampler),
1836	      inst->sampler,
1837	      WRITEMASK_XYZW,
1838	      msg_type,
1839	      rlen,
1840	      inst->mlen,
1841	      0,
1842	      1,
1843	      simd_mode);
1844}
1845
1846
1847/* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
1848 * looking like:
1849 *
1850 * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
1851 *
1852 * and we're trying to produce:
1853 *
1854 *           DDX                     DDY
1855 * dst: (ss0.tr - ss0.tl)     (ss0.tl - ss0.bl)
1856 *      (ss0.tr - ss0.tl)     (ss0.tr - ss0.br)
1857 *      (ss0.br - ss0.bl)     (ss0.tl - ss0.bl)
1858 *      (ss0.br - ss0.bl)     (ss0.tr - ss0.br)
1859 *      (ss1.tr - ss1.tl)     (ss1.tl - ss1.bl)
1860 *      (ss1.tr - ss1.tl)     (ss1.tr - ss1.br)
1861 *      (ss1.br - ss1.bl)     (ss1.tl - ss1.bl)
1862 *      (ss1.br - ss1.bl)     (ss1.tr - ss1.br)
1863 *
1864 * and add another set of two more subspans if in 16-pixel dispatch mode.
1865 *
1866 * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
1867 * for each pair, and vertstride = 2 jumps us 2 elements after processing a
1868 * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
1869 * between each other.  We could probably do it like ddx and swizzle the right
1870 * order later, but bail for now and just produce
1871 * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
1872 */
1873void
1874fs_visitor::generate_ddx(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
1875{
1876   struct brw_reg src0 = brw_reg(src.file, src.nr, 1,
1877				 BRW_REGISTER_TYPE_F,
1878				 BRW_VERTICAL_STRIDE_2,
1879				 BRW_WIDTH_2,
1880				 BRW_HORIZONTAL_STRIDE_0,
1881				 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
1882   struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
1883				 BRW_REGISTER_TYPE_F,
1884				 BRW_VERTICAL_STRIDE_2,
1885				 BRW_WIDTH_2,
1886				 BRW_HORIZONTAL_STRIDE_0,
1887				 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
1888   brw_ADD(p, dst, src0, negate(src1));
1889}
1890
1891void
1892fs_visitor::generate_ddy(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
1893{
1894   struct brw_reg src0 = brw_reg(src.file, src.nr, 0,
1895				 BRW_REGISTER_TYPE_F,
1896				 BRW_VERTICAL_STRIDE_4,
1897				 BRW_WIDTH_4,
1898				 BRW_HORIZONTAL_STRIDE_0,
1899				 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
1900   struct brw_reg src1 = brw_reg(src.file, src.nr, 2,
1901				 BRW_REGISTER_TYPE_F,
1902				 BRW_VERTICAL_STRIDE_4,
1903				 BRW_WIDTH_4,
1904				 BRW_HORIZONTAL_STRIDE_0,
1905				 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
1906   brw_ADD(p, dst, src0, negate(src1));
1907}
1908
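/* A sketch of how the regions above walk the payload (illustrative,
 * not in the original source).  For DDX, the <2;2,0> region at subnr 1
 * reads "tr tr br br ..." while the same region at subnr 0 reads
 * "tl tl bl bl ...", so the ADD yields (tr - tl) and (br - bl) per
 * subspan.  For DDY, the <4;4,0> regions broadcast tl and bl across
 * each subspan, producing the (tl - bl)x4 result described above.
 */
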
1909void
1910fs_visitor::generate_discard_not(fs_inst *inst, struct brw_reg mask)
1911{
1912   brw_push_insn_state(p);
1913   brw_set_mask_control(p, BRW_MASK_DISABLE);
1914   brw_NOT(p, mask, brw_mask_reg(1)); /* IMASK */
1915   brw_pop_insn_state(p);
1916}
1917
1918void
1919fs_visitor::generate_discard_and(fs_inst *inst, struct brw_reg mask)
1920{
1921   struct brw_reg g0 = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
1922   mask = brw_uw1_reg(mask.file, mask.nr, 0);
1923
1924   brw_push_insn_state(p);
1925   brw_set_mask_control(p, BRW_MASK_DISABLE);
1926   brw_AND(p, g0, mask, g0);
1927   brw_pop_insn_state(p);
1928}
1929
1930void
1931fs_visitor::assign_curb_setup()
1932{
1933   c->prog_data.first_curbe_grf = c->key.nr_payload_regs;
1934   c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
1935
1936   /* Map the offsets in the UNIFORM file to fixed HW regs. */
1937   foreach_iter(exec_list_iterator, iter, this->instructions) {
1938      fs_inst *inst = (fs_inst *)iter.get();
1939
1940      for (unsigned int i = 0; i < 3; i++) {
1941	 if (inst->src[i].file == UNIFORM) {
1942	    int constant_nr = inst->src[i].hw_reg + inst->src[i].reg_offset;
1943	    struct brw_reg brw_reg = brw_vec1_grf(c->prog_data.first_curbe_grf +
1944						  constant_nr / 8,
1945						  constant_nr % 8);
1946
1947	    inst->src[i].file = FIXED_HW_REG;
1948	    inst->src[i].fixed_hw_reg = brw_reg;
1949	 }
1950      }
1951   }
1952}
1953
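/* Worked example (illustrative, not in the original source): with
 * nr_params = 20, curb_read_length = ALIGN(20, 8) / 8 = 3 registers,
 * and uniform 11 maps to subregister 11 % 8 = 3 of GRF
 * first_curbe_grf + 11 / 8, since eight floats fit per 256-bit
 * register.
 */
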
1954void
1955fs_visitor::calculate_urb_setup()
1956{
1957   for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
1958      urb_setup[i] = -1;
1959   }
1960
1961   int urb_next = 0;
1962   /* Figure out where each of the incoming setup attributes lands. */
1963   if (intel->gen >= 6) {
1964      for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
1965	 if (brw->fragment_program->Base.InputsRead & BITFIELD64_BIT(i)) {
1966	    urb_setup[i] = urb_next++;
1967	 }
1968      }
1969   } else {
1970      /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1971      for (unsigned int i = 0; i < VERT_RESULT_MAX; i++) {
1972	 if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
1973	    int fp_index;
1974
1975	    if (i >= VERT_RESULT_VAR0)
1976	       fp_index = i - (VERT_RESULT_VAR0 - FRAG_ATTRIB_VAR0);
1977	    else if (i <= VERT_RESULT_TEX7)
1978	       fp_index = i;
1979	    else
1980	       fp_index = -1;
1981
1982	    if (fp_index >= 0)
1983	       urb_setup[fp_index] = urb_next++;
1984	 }
1985      }
1986   }
1987
1988   /* Each attribute is 4 setup channels, each of which is half a reg. */
1989   c->prog_data.urb_read_length = urb_next * 2;
1990}
1991
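/* Illustrative count (not in the original source): a shader reading
 * gl_Color plus one texcoord gets urb_next = 2 and urb_read_length =
 * 4, since each attribute's four components carry their setup data in
 * half a register apiece.
 */
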
1992void
1993fs_visitor::assign_urb_setup()
1994{
1995   int urb_start = c->prog_data.first_curbe_grf + c->prog_data.curb_read_length;
1996
1997   /* Offset all the urb_setup[] index by the actual position of the
1998    * setup regs, now that the location of the constants has been chosen.
1999    */
2000   foreach_iter(exec_list_iterator, iter, this->instructions) {
2001      fs_inst *inst = (fs_inst *)iter.get();
2002
2003      if (inst->opcode != FS_OPCODE_LINTERP)
2004	 continue;
2005
2006      assert(inst->src[2].file == FIXED_HW_REG);
2007
2008      inst->src[2].fixed_hw_reg.nr += urb_start;
2009   }
2010
2011   this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
2012}
2013
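/* The resulting register layout, roughly (illustrative): fixed payload
 * in [0, first_curbe_grf), push constants in [first_curbe_grf,
 * urb_start), setup data in [urb_start, first_non_payload_grf), and
 * allocatable GRFs after that.
 */
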
2014static void
2015assign_reg(int *reg_hw_locations, fs_reg *reg)
2016{
2017   if (reg->file == GRF && reg->reg != 0) {
2018      reg->hw_reg = reg_hw_locations[reg->reg] + reg->reg_offset;
2019      reg->reg = 0;
2020   }
2021}
2022
2023void
2024fs_visitor::assign_regs_trivial()
2025{
2026   int last_grf = 0;
2027   int hw_reg_mapping[this->virtual_grf_next];
2028   int i;
2029
2030   hw_reg_mapping[0] = 0;
2031   hw_reg_mapping[1] = this->first_non_payload_grf;
2032   for (i = 2; i < this->virtual_grf_next; i++) {
2033      hw_reg_mapping[i] = (hw_reg_mapping[i - 1] +
2034			   this->virtual_grf_sizes[i - 1]);
2035   }
2036   last_grf = hw_reg_mapping[i - 1] + this->virtual_grf_sizes[i - 1];
2037
2038   foreach_iter(exec_list_iterator, iter, this->instructions) {
2039      fs_inst *inst = (fs_inst *)iter.get();
2040
2041      assign_reg(hw_reg_mapping, &inst->dst);
2042      assign_reg(hw_reg_mapping, &inst->src[0]);
2043      assign_reg(hw_reg_mapping, &inst->src[1]);
2044   }
2045
2046   this->grf_used = last_grf + 1;
2047}
2048
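/* Illustrative packing (not in the original source): virtual GRFs of
 * sizes {1, 4, 1} with first_non_payload_grf = 6 land at hw regs 6, 7
 * and 11, and grf_used becomes 12.  No lifetimes are consulted, so
 * this only works while everything fits under BRW_MAX_GRF at once.
 */
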
2049void
2050fs_visitor::assign_regs()
2051{
2052   int last_grf = 0;
2053   int hw_reg_mapping[this->virtual_grf_next + 1];
2054   int base_reg_count = BRW_MAX_GRF - this->first_non_payload_grf;
2055   int class_sizes[base_reg_count];
2056   int class_count = 0;
2057   int aligned_pair_class = -1;
2058
2059   /* Set up the register classes.
2060    *
2061    * The base registers store a scalar value.  For texture samples,
2062    * we get virtual GRFs composed of 4 contiguous hw registers.  For
2063    * structures and arrays, we store them as contiguous larger things
2064    * than that, though we should be able to do better most of the
2065    * time.
2066    */
2067   class_sizes[class_count++] = 1;
2068   if (brw->has_pln && intel->gen < 6) {
2069      /* Always set up the (unaligned) pairs for gen5, so we can find
2070       * them for making the aligned pair class.
2071       */
2072      class_sizes[class_count++] = 2;
2073   }
2074   for (int r = 1; r < this->virtual_grf_next; r++) {
2075      int i;
2076
2077      for (i = 0; i < class_count; i++) {
2078	 if (class_sizes[i] == this->virtual_grf_sizes[r])
2079	    break;
2080      }
2081      if (i == class_count) {
2082	 if (this->virtual_grf_sizes[r] >= base_reg_count) {
2083	    fprintf(stderr, "Object too large to register allocate.\n");
2084	    this->fail = true;
2085	 }
2086
2087	 class_sizes[class_count++] = this->virtual_grf_sizes[r];
2088      }
2089   }
2090
2091   int ra_reg_count = 0;
2092   int class_base_reg[class_count];
2093   int class_reg_count[class_count];
2094   int classes[class_count + 1];
2095
2096   for (int i = 0; i < class_count; i++) {
2097      class_base_reg[i] = ra_reg_count;
2098      class_reg_count[i] = base_reg_count - (class_sizes[i] - 1);
2099      ra_reg_count += class_reg_count[i];
2100   }
2101
2102   struct ra_regs *regs = ra_alloc_reg_set(ra_reg_count);
2103   for (int i = 0; i < class_count; i++) {
2104      classes[i] = ra_alloc_reg_class(regs);
2105
2106      for (int i_r = 0; i_r < class_reg_count[i]; i_r++) {
2107	 ra_class_add_reg(regs, classes[i], class_base_reg[i] + i_r);
2108      }
2109
2110      /* Add conflicts between our contiguous registers aliasing
2111       * base regs and other register classes' contiguous registers
2112       * that alias base regs, or the base regs themselves for classes[0].
2113       */
2114      for (int c = 0; c <= i; c++) {
2115	 for (int i_r = 0; i_r < class_reg_count[i]; i_r++) {
2116	    for (int c_r = MAX2(0, i_r - (class_sizes[c] - 1));
2117		 c_r < MIN2(class_reg_count[c], i_r + class_sizes[i]);
2118		 c_r++) {
2119
2120	       if (0) {
2121		  printf("%d/%d conflicts %d/%d\n",
2122			 class_sizes[i], this->first_non_payload_grf + i_r,
2123			 class_sizes[c], this->first_non_payload_grf + c_r);
2124	       }
2125
2126	       ra_add_reg_conflict(regs,
2127				   class_base_reg[i] + i_r,
2128				   class_base_reg[c] + c_r);
2129	    }
2130	 }
2131      }
2132   }
2133
2134   /* Add a special class for aligned pairs, which we'll put delta_x/y
2135    * in on gen5 so that we can do PLN.
2136    */
2137   if (brw->has_pln && intel->gen < 6) {
2138      int reg_count = (base_reg_count - 1) / 2;
2139      int unaligned_pair_class = 1;
2140      assert(class_sizes[unaligned_pair_class] == 2);
2141
2142      aligned_pair_class = class_count;
2143      classes[aligned_pair_class] = ra_alloc_reg_class(regs);
2144      class_base_reg[aligned_pair_class] = 0;
2145      class_reg_count[aligned_pair_class] = 0;
2146      int start = (this->first_non_payload_grf & 1) ? 1 : 0;
2147
2148      for (int i = 0; i < reg_count; i++) {
2149	 ra_class_add_reg(regs, classes[aligned_pair_class],
2150			  class_base_reg[unaligned_pair_class] + i * 2 + start);
2151      }
2152      class_count++;
2153   }
2154
2155   ra_set_finalize(regs);
2156
2157   struct ra_graph *g = ra_alloc_interference_graph(regs,
2158						    this->virtual_grf_next);
2159   /* Node 0 is just a placeholder to keep virtual_grf[] mapping 1:1
2160    * with nodes.
2161    */
2162   ra_set_node_class(g, 0, classes[0]);
2163
2164   for (int i = 1; i < this->virtual_grf_next; i++) {
2165      for (int c = 0; c < class_count; c++) {
2166	 if (class_sizes[c] == this->virtual_grf_sizes[i]) {
2167	    if (aligned_pair_class >= 0 &&
2168		this->delta_x.reg == i) {
2169	       ra_set_node_class(g, i, classes[aligned_pair_class]);
2170	    } else {
2171	       ra_set_node_class(g, i, classes[c]);
2172	    }
2173	    break;
2174	 }
2175      }
2176
2177      for (int j = 1; j < i; j++) {
2178	 if (virtual_grf_interferes(i, j)) {
2179	    ra_add_node_interference(g, i, j);
2180	 }
2181      }
2182   }
2183
2184   /* FINISHME: Handle spilling */
2185   if (!ra_allocate_no_spills(g)) {
2186      fprintf(stderr, "Failed to allocate registers.\n");
2187      this->fail = true;
2188      return;
2189   }
2190
2191   /* Get the chosen virtual registers for each node, and map virtual
2192    * regs in the register classes back down to real hardware reg
2193    * numbers.
2194    */
2195   hw_reg_mapping[0] = 0; /* unused */
2196   for (int i = 1; i < this->virtual_grf_next; i++) {
2197      int reg = ra_get_node_reg(g, i);
2198      int hw_reg = -1;
2199
2200      for (int c = 0; c < class_count; c++) {
2201	 if (reg >= class_base_reg[c] &&
2202	     reg < class_base_reg[c] + class_reg_count[c]) {
2203	    hw_reg = reg - class_base_reg[c];
2204	    break;
2205	 }
2206      }
2207
2208      assert(hw_reg != -1);
2209      hw_reg_mapping[i] = this->first_non_payload_grf + hw_reg;
2210      last_grf = MAX2(last_grf,
2211		      hw_reg_mapping[i] + this->virtual_grf_sizes[i] - 1);
2212   }
2213
2214   foreach_iter(exec_list_iterator, iter, this->instructions) {
2215      fs_inst *inst = (fs_inst *)iter.get();
2216
2217      assign_reg(hw_reg_mapping, &inst->dst);
2218      assign_reg(hw_reg_mapping, &inst->src[0]);
2219      assign_reg(hw_reg_mapping, &inst->src[1]);
2220   }
2221
2222   this->grf_used = last_grf + 1;
2223
2224   talloc_free(g);
2225   talloc_free(regs);
2226}
2227
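/* Worked example of the class layout above (illustrative, not in the
 * original source): with base_reg_count = 100 and class_sizes =
 * {1, 2}, class 0 covers ra regs [0, 100) and class 1 covers
 * [100, 199), since a 2-reg value has 99 possible start positions.
 * The pair at base reg 0 (ra reg 100) then conflicts with ra regs 0
 * and 1 and with the pair starting at base reg 1.
 */
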
2228void
2229fs_visitor::calculate_live_intervals()
2230{
2231   int num_vars = this->virtual_grf_next;
2232   int *def = talloc_array(mem_ctx, int, num_vars);
2233   int *use = talloc_array(mem_ctx, int, num_vars);
2234   int loop_depth = 0;
2235   int loop_start = 0;
2236
2237   for (int i = 0; i < num_vars; i++) {
2238      def[i] = 1 << 30;
2239      use[i] = -1;
2240   }
2241
2242   int ip = 0;
2243   foreach_iter(exec_list_iterator, iter, this->instructions) {
2244      fs_inst *inst = (fs_inst *)iter.get();
2245
2246      if (inst->opcode == BRW_OPCODE_DO) {
2247	 if (loop_depth++ == 0)
2248	    loop_start = ip;
2249      } else if (inst->opcode == BRW_OPCODE_WHILE) {
2250	 loop_depth--;
2251
2252	 if (loop_depth == 0) {
2253	    /* FINISHME:
2254	     *
2255	     * Patches up any vars marked for use within the loop as
2256	     * live until the end.  This is conservative, as there
2257	     * will often be variables defined and used inside the
2258	     * loop but dead at the end of the loop body.
2259	     */
2260	    for (int i = 0; i < num_vars; i++) {
2261	       if (use[i] == loop_start) {
2262		  use[i] = ip;
2263	       }
2264	    }
2265	 }
2266      } else {
2267	 int eip = ip;
2268
2269	 if (loop_depth)
2270	    eip = loop_start;
2271
2272	 for (unsigned int i = 0; i < 3; i++) {
2273	    if (inst->src[i].file == GRF && inst->src[i].reg != 0) {
2274	       use[inst->src[i].reg] = MAX2(use[inst->src[i].reg], eip);
2275	    }
2276	 }
2277	 if (inst->dst.file == GRF && inst->dst.reg != 0) {
2278	    def[inst->dst.reg] = MIN2(def[inst->dst.reg], eip);
2279	 }
2280      }
2281
2282      ip++;
2283   }
2284
2285   talloc_free(this->virtual_grf_def);
2286   talloc_free(this->virtual_grf_use);
2287   this->virtual_grf_def = def;
2288   this->virtual_grf_use = use;
2289}
2290
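/* Illustrative extension (not in the original source): given
 *
 *    0: MOV   vgrf1, ...
 *    1: DO
 *    2: ADD   vgrf2, vgrf1, ...
 *    3: WHILE
 *
 * the use of vgrf1 inside the loop is recorded at eip = loop_start =
 * 1, then patched up to ip = 3 when the WHILE closes, so vgrf1 stays
 * live across every iteration instead of appearing dead after the
 * ADD.
 */
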
2291/**
2292 * Attempts to move immediate constants into the immediate
2293 * constant slot of following instructions.
2294 *
2295 * Immediate constants are a bit tricky -- they have to be in the last
2296 * operand slot, and you can't apply abs/negate to them.
2297 */
2298
2299bool
2300fs_visitor::propagate_constants()
2301{
2302   bool progress = false;
2303
2304   foreach_iter(exec_list_iterator, iter, this->instructions) {
2305      fs_inst *inst = (fs_inst *)iter.get();
2306
2307      if (inst->opcode != BRW_OPCODE_MOV ||
2308	  inst->predicated ||
2309	  inst->dst.file != GRF || inst->src[0].file != IMM ||
2310	  inst->dst.type != inst->src[0].type)
2311	 continue;
2312
2313      /* Don't bother with cases where we should have had the
2314       * operation on the constant folded in GLSL already.
2315       */
2316      if (inst->saturate)
2317	 continue;
2318
2319      /* Found a move of a constant to a GRF.  Find anything else using the GRF
2320       * before it's written, and replace it with the constant if we can.
2321       */
2322      exec_list_iterator scan_iter = iter;
2323      scan_iter.next();
2324      for (; scan_iter.has_next(); scan_iter.next()) {
2325	 fs_inst *scan_inst = (fs_inst *)scan_iter.get();
2326
2327	 if (scan_inst->opcode == BRW_OPCODE_DO ||
2328	     scan_inst->opcode == BRW_OPCODE_WHILE ||
2329	     scan_inst->opcode == BRW_OPCODE_ELSE ||
2330	     scan_inst->opcode == BRW_OPCODE_ENDIF) {
2331	    break;
2332	 }
2333
2334	 for (int i = 2; i >= 0; i--) {
2335	    if (scan_inst->src[i].file != GRF ||
2336		scan_inst->src[i].reg != inst->dst.reg ||
2337		scan_inst->src[i].reg_offset != inst->dst.reg_offset)
2338	       continue;
2339
2340	    /* Don't bother with cases where we should have had the
2341	     * operation on the constant folded in GLSL already.
2342	     */
2343	    if (scan_inst->src[i].negate || scan_inst->src[i].abs)
2344	       continue;
2345
2346	    switch (scan_inst->opcode) {
2347	    case BRW_OPCODE_MOV:
2348	       scan_inst->src[i] = inst->src[0];
2349	       progress = true;
2350	       break;
2351
2352	    case BRW_OPCODE_MUL:
2353	    case BRW_OPCODE_ADD:
2354	       if (i == 1) {
2355		  scan_inst->src[i] = inst->src[0];
2356		  progress = true;
2357	       } else if (i == 0 && scan_inst->src[1].file != IMM) {
2358		  /* Fit this constant in by commuting the operands */
2359		  scan_inst->src[0] = scan_inst->src[1];
2360		  scan_inst->src[1] = inst->src[0];
		  progress = true;
2361	       }
2362	       break;
2363	    case BRW_OPCODE_CMP:
2364	       if (i == 1) {
2365		  scan_inst->src[i] = inst->src[0];
2366		  progress = true;
2367	       }
2368	    }
2369	 }
2370
2371	 if (scan_inst->dst.file == GRF &&
2372	     scan_inst->dst.reg == inst->dst.reg &&
2373	     (scan_inst->dst.reg_offset == inst->dst.reg_offset ||
2374	      scan_inst->opcode == FS_OPCODE_TEX)) {
2375	    break;
2376	 }
2377      }
2378   }
2379
2380   return progress;
2381}
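
/* Illustrative transformation (not from the original source):
 *
 *    MOV vgrf2, 0.5F
 *    ADD vgrf3, vgrf2, vgrf4
 *
 * becomes
 *
 *    MOV vgrf2, 0.5F
 *    ADD vgrf3, vgrf4, 0.5F
 *
 * by commuting the operands so the immediate lands in src1, the only
 * slot that accepts one; the now-dead MOV is left for
 * dead_code_eliminate() to clean up.
 */
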
2382/**
2383 * Must be called after calculate_live_intervals() to remove unused
2384 * writes to registers -- register allocation will fail otherwise
2385 * because something defined but not used won't be considered to
2386 * interfere with other regs.
2387 */
2388bool
2389fs_visitor::dead_code_eliminate()
2390{
2391   bool progress = false;
2392   int num_vars = this->virtual_grf_next;
2393   bool dead[num_vars];
2394
2395   for (int i = 0; i < num_vars; i++) {
2396      dead[i] = this->virtual_grf_def[i] >= this->virtual_grf_use[i];
2397
2398      if (dead[i]) {
2399	 /* Mark off its interval so it won't interfere with anything. */
2400	 this->virtual_grf_def[i] = -1;
2401	 this->virtual_grf_use[i] = -1;
2402      }
2403   }
2404
2405   foreach_iter(exec_list_iterator, iter, this->instructions) {
2406      fs_inst *inst = (fs_inst *)iter.get();
2407
2408      if (inst->dst.file == GRF && dead[inst->dst.reg]) {
2409	 inst->remove();
2410	 progress = true;
2411      }
2412   }
2413
2414   return progress;
2415}
2416
2417bool
2418fs_visitor::register_coalesce()
2419{
2420   bool progress = false;
2421
2422   foreach_iter(exec_list_iterator, iter, this->instructions) {
2423      fs_inst *inst = (fs_inst *)iter.get();
2424
2425      if (inst->opcode != BRW_OPCODE_MOV ||
2426	  inst->predicated ||
2427	  inst->saturate ||
2428	  inst->dst.file != GRF || inst->src[0].file != GRF ||
2429	  inst->dst.type != inst->src[0].type)
2430	 continue;
2431
2432      /* Found a move of a GRF to a GRF.  Let's see if we can coalesce
2433       * them: check for no writes to either one until the exit of the
2434       * program.
2435       */
2436      bool interfered = false;
2437      exec_list_iterator scan_iter = iter;
2438      scan_iter.next();
2439      for (; scan_iter.has_next(); scan_iter.next()) {
2440	 fs_inst *scan_inst = (fs_inst *)scan_iter.get();
2441
2442	 if (scan_inst->opcode == BRW_OPCODE_DO ||
2443	     scan_inst->opcode == BRW_OPCODE_WHILE ||
2444	     scan_inst->opcode == BRW_OPCODE_ENDIF) {
2445	    interfered = true;
2446	    iter = scan_iter;
2447	    break;
2448	 }
2449
2450	 if (scan_inst->dst.file == GRF) {
2451	    if (scan_inst->dst.reg == inst->dst.reg &&
2452		(scan_inst->dst.reg_offset == inst->dst.reg_offset ||
2453		 scan_inst->opcode == FS_OPCODE_TEX)) {
2454	       interfered = true;
2455	       break;
2456	    }
2457	    if (scan_inst->dst.reg == inst->src[0].reg &&
2458		(scan_inst->dst.reg_offset == inst->src[0].reg_offset ||
2459		 scan_inst->opcode == FS_OPCODE_TEX)) {
2460	       interfered = true;
2461	       break;
2462	    }
2463	 }
2464      }
2465      if (interfered) {
2466	 continue;
2467      }
2468
2469      /* Rewrite the later usage to point at the source of the move to
2470       * be removed.
2471       */
2472      for (exec_list_iterator scan_iter = iter; scan_iter.has_next();
2473	   scan_iter.next()) {
2474	 fs_inst *scan_inst = (fs_inst *)scan_iter.get();
2475
2476	 for (int i = 0; i < 3; i++) {
2477	    if (scan_inst->src[i].file == GRF &&
2478		scan_inst->src[i].reg == inst->dst.reg &&
2479		scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
2480	       scan_inst->src[i].reg = inst->src[0].reg;
2481	       scan_inst->src[i].reg_offset = inst->src[0].reg_offset;
2482	       scan_inst->src[i].abs |= inst->src[0].abs;
2483	       scan_inst->src[i].negate ^= inst->src[0].negate;
2484	    }
2485	 }
2486      }
2487
2488      inst->remove();
2489      progress = true;
2490   }
2491
2492   return progress;
2493}
2494
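/* Illustrative coalesce (not from the original source):
 *
 *    MOV vgrf5, vgrf3
 *    MUL vgrf6, vgrf5, vgrf7
 *
 * becomes
 *
 *    MUL vgrf6, vgrf3, vgrf7
 *
 * once the scan shows neither vgrf3 nor vgrf5 is rewritten before the
 * end of the program; abs/negate flags from the removed MOV's source
 * are folded into each rewritten use.
 */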
2495
2496bool
2497fs_visitor::compute_to_mrf()
2498{
2499   bool progress = false;
2500   int next_ip = 0;
2501
2502   foreach_iter(exec_list_iterator, iter, this->instructions) {
2503      fs_inst *inst = (fs_inst *)iter.get();
2504
2505      int ip = next_ip;
2506      next_ip++;
2507
2508      if (inst->opcode != BRW_OPCODE_MOV ||
2509	  inst->predicated ||
2510	  inst->dst.file != MRF || inst->src[0].file != GRF ||
2511	  inst->dst.type != inst->src[0].type ||
2512	  inst->src[0].abs || inst->src[0].negate)
2513	 continue;
2514
2515      /* Can't compute-to-MRF this GRF if someone else was going to
2516       * read it later.
2517       */
2518      if (this->virtual_grf_use[inst->src[0].reg] > ip)
2519	 continue;
2520
2521      /* Found a move of a GRF to a MRF.  Let's see if we can go
2522       * rewrite the thing that made this GRF to write into the MRF.
2523       */
2524      bool found = false;
2525      fs_inst *scan_inst;
2526      for (scan_inst = (fs_inst *)inst->prev;
2527	   scan_inst->prev != NULL;
2528	   scan_inst = (fs_inst *)scan_inst->prev) {
2529	 /* We don't handle flow control here.  Most computation of
2530	  * values that end up in MRFs happens shortly before the MRF
2531	  * write anyway.
2532	  */
2533	 if (scan_inst->opcode == BRW_OPCODE_DO ||
2534	     scan_inst->opcode == BRW_OPCODE_WHILE ||
2535	     scan_inst->opcode == BRW_OPCODE_ENDIF) {
2536	    break;
2537	 }
2538
2539	 /* You can't read from an MRF, so if someone else reads our
2540	  * MRF's source GRF that we wanted to rewrite, that stops us.
2541	  */
2542	 bool interfered = false;
2543	 for (int i = 0; i < 3; i++) {
2544	    if (scan_inst->src[i].file == GRF &&
2545		scan_inst->src[i].reg == inst->src[0].reg &&
2546		scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2547	       interfered = true;
2548	    }
2549	 }
2550	 if (interfered)
2551	    break;
2552
2553	 if (scan_inst->dst.file == MRF &&
2554	     scan_inst->dst.hw_reg == inst->dst.hw_reg) {
2555	    /* Somebody else wrote our MRF here, so we can't
2556	     * compute-to-MRF before that.
2557	     */
2558	    break;
2559	 }
2560
2561	 if (scan_inst->mlen > 0) {
2562	    /* Found a SEND instruction, which will do some amount of
2563	     * implied write that may overwrite our MRF that we were
2564	     * hoping to compute-to-MRF somewhere above it.  No SEND
2565	     * we emit has an implied write of more than 2 MRFs past
2566	     * base_mrf, though.
2567	     */
2568	    int implied_write_len = MIN2(scan_inst->mlen, 2);
2569	    if (inst->dst.hw_reg >= scan_inst->base_mrf &&
2570		inst->dst.hw_reg < scan_inst->base_mrf + implied_write_len) {
2571	       break;
2572	    }
2573	 }
2574
2575	 if (scan_inst->dst.file == GRF &&
2576	     scan_inst->dst.reg == inst->src[0].reg) {
2577	    /* Found the last thing to write our reg we want to turn
2578	     * into a compute-to-MRF.
2579	     */
2580
2581	    if (scan_inst->opcode == FS_OPCODE_TEX) {
2582	       /* texturing writes several contiguous regs, so we can't
2583		* compute-to-mrf that.
2584		*/
2585	       break;
2586	    }
2587
2588	    /* If it's predicated, it (probably) didn't populate all
2589	     * the channels.
2590	     */
2591	    if (scan_inst->predicated)
2592	       break;
2593
2594	    /* SEND instructions can't have MRF as a destination. */
2595	    if (scan_inst->mlen)
2596	       break;
2597
2598	    if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2599	       /* Found the creator of our MRF's source value. */
2600	       found = true;
2601	       break;
2602	    }
2603	 }
2604      }
2605      if (found) {
2606	 scan_inst->dst.file = MRF;
2607	 scan_inst->dst.hw_reg = inst->dst.hw_reg;
2608	 scan_inst->saturate |= inst->saturate;
2609	 inst->remove();
2610	 progress = true;
2611      }
2612   }
2613
2614   return progress;
2615}
2616
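/* Illustrative rewrite (not from the original source):
 *
 *    ADD vgrf4, vgrf2, vgrf3
 *    MOV m2, vgrf4
 *
 * becomes
 *
 *    ADD m2, vgrf2, vgrf3
 *
 * when vgrf4 has no later readers and nothing between the ADD and the
 * MOV writes m2 or reads vgrf4.
 */
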
2617bool
2618fs_visitor::virtual_grf_interferes(int a, int b)
2619{
2620   int start = MAX2(this->virtual_grf_def[a], this->virtual_grf_def[b]);
2621   int end = MIN2(this->virtual_grf_use[a], this->virtual_grf_use[b]);
2622
2623   /* For dead code, just check if the def interferes with the other range. */
2624   if (this->virtual_grf_use[a] == -1) {
2625      return (this->virtual_grf_def[a] >= this->virtual_grf_def[b] &&
2626	      this->virtual_grf_def[a] < this->virtual_grf_use[b]);
2627   }
2628   if (this->virtual_grf_use[b] == -1) {
2629      return (this->virtual_grf_def[b] >= this->virtual_grf_def[a] &&
2630	      this->virtual_grf_def[b] < this->virtual_grf_use[a]);
2631   }
2632
2633   return start < end;
2634}
2635
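/* Illustrative check (not from the original source): live ranges with
 * def/use of {2, 7} and {7, 9} do not interfere -- start = MAX2(2, 7)
 * = 7, end = MIN2(7, 9) = 7, and 7 < 7 is false -- so a value that
 * dies exactly where another is defined can share its hardware
 * register.
 */
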
2636static struct brw_reg brw_reg_from_fs_reg(fs_reg *reg)
2637{
2638   struct brw_reg brw_reg;
2639
2640   switch (reg->file) {
2641   case GRF:
2642   case ARF:
2643   case MRF:
2644      brw_reg = brw_vec8_reg(reg->file,
2645			    reg->hw_reg, 0);
2646      brw_reg = retype(brw_reg, reg->type);
2647      break;
2648   case IMM:
2649      switch (reg->type) {
2650      case BRW_REGISTER_TYPE_F:
2651	 brw_reg = brw_imm_f(reg->imm.f);
2652	 break;
2653      case BRW_REGISTER_TYPE_D:
2654	 brw_reg = brw_imm_d(reg->imm.i);
2655	 break;
2656      case BRW_REGISTER_TYPE_UD:
2657	 brw_reg = brw_imm_ud(reg->imm.u);
2658	 break;
2659      default:
2660	 assert(!"not reached");
2661	 break;
2662      }
2663      break;
2664   case FIXED_HW_REG:
2665      brw_reg = reg->fixed_hw_reg;
2666      break;
2667   case BAD_FILE:
2668      /* Probably unused. */
2669      brw_reg = brw_null_reg();
2670      break;
2671   case UNIFORM:
2672      assert(!"not reached");
2673      brw_reg = brw_null_reg();
2674      break;
2675   }
2676   if (reg->abs)
2677      brw_reg = brw_abs(brw_reg);
2678   if (reg->negate)
2679      brw_reg = negate(brw_reg);
2680
2681   return brw_reg;
2682}
2683
2684void
2685fs_visitor::generate_code()
2686{
2687   unsigned int annotation_len = 0;
2688   int last_native_inst = 0;
2689   struct brw_instruction *if_stack[16], *loop_stack[16];
2690   int if_stack_depth = 0, loop_stack_depth = 0;
2691   int if_depth_in_loop[16];
2692
2693   if_depth_in_loop[loop_stack_depth] = 0;
2694
2695   memset(&if_stack, 0, sizeof(if_stack));
2696   foreach_iter(exec_list_iterator, iter, this->instructions) {
2697      fs_inst *inst = (fs_inst *)iter.get();
2698      struct brw_reg src[3], dst;
2699
2700      for (unsigned int i = 0; i < 3; i++) {
2701	 src[i] = brw_reg_from_fs_reg(&inst->src[i]);
2702      }
2703      dst = brw_reg_from_fs_reg(&inst->dst);
2704
2705      brw_set_conditionalmod(p, inst->conditional_mod);
2706      brw_set_predicate_control(p, inst->predicated);
2707
2708      switch (inst->opcode) {
2709      case BRW_OPCODE_MOV:
2710	 brw_MOV(p, dst, src[0]);
2711	 break;
2712      case BRW_OPCODE_ADD:
2713	 brw_ADD(p, dst, src[0], src[1]);
2714	 break;
2715      case BRW_OPCODE_MUL:
2716	 brw_MUL(p, dst, src[0], src[1]);
2717	 break;
2718
2719      case BRW_OPCODE_FRC:
2720	 brw_FRC(p, dst, src[0]);
2721	 break;
2722      case BRW_OPCODE_RNDD:
2723	 brw_RNDD(p, dst, src[0]);
2724	 break;
2725      case BRW_OPCODE_RNDZ:
2726	 brw_RNDZ(p, dst, src[0]);
2727	 break;
2728
2729      case BRW_OPCODE_AND:
2730	 brw_AND(p, dst, src[0], src[1]);
2731	 break;
2732      case BRW_OPCODE_OR:
2733	 brw_OR(p, dst, src[0], src[1]);
2734	 break;
2735      case BRW_OPCODE_XOR:
2736	 brw_XOR(p, dst, src[0], src[1]);
2737	 break;
2738      case BRW_OPCODE_NOT:
2739	 brw_NOT(p, dst, src[0]);
2740	 break;
2741      case BRW_OPCODE_ASR:
2742	 brw_ASR(p, dst, src[0], src[1]);
2743	 break;
2744      case BRW_OPCODE_SHR:
2745	 brw_SHR(p, dst, src[0], src[1]);
2746	 break;
2747      case BRW_OPCODE_SHL:
2748	 brw_SHL(p, dst, src[0], src[1]);
2749	 break;
2750
2751      case BRW_OPCODE_CMP:
2752	 brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
2753	 break;
2754      case BRW_OPCODE_SEL:
2755	 brw_SEL(p, dst, src[0], src[1]);
2756	 break;
2757
2758      case BRW_OPCODE_IF:
2759	 assert(if_stack_depth < 16);
2760	 if_stack[if_stack_depth] = brw_IF(p, BRW_EXECUTE_8);
2761	 if_depth_in_loop[loop_stack_depth]++;
2762	 if_stack_depth++;
2763	 break;
2764      case BRW_OPCODE_ELSE:
2765	 if_stack[if_stack_depth - 1] =
2766	    brw_ELSE(p, if_stack[if_stack_depth - 1]);
2767	 break;
2768      case BRW_OPCODE_ENDIF:
2769	 if_stack_depth--;
2770	 brw_ENDIF(p , if_stack[if_stack_depth]);
2771	 if_depth_in_loop[loop_stack_depth]--;
2772	 break;
2773
2774      case BRW_OPCODE_DO:
	 assert(loop_stack_depth < 16);
2775	 loop_stack[loop_stack_depth++] = brw_DO(p, BRW_EXECUTE_8);
2776	 if_depth_in_loop[loop_stack_depth] = 0;
2777	 break;
2778
2779      case BRW_OPCODE_BREAK:
2780	 brw_BREAK(p, if_depth_in_loop[loop_stack_depth]);
2781	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2782	 break;
2783      case BRW_OPCODE_CONTINUE:
2784	 brw_CONT(p, if_depth_in_loop[loop_stack_depth]);
2785	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2786	 break;
2787
2788      case BRW_OPCODE_WHILE: {
2789	 struct brw_instruction *inst0, *inst1;
2790	 GLuint br = 1;
2791
2792	 if (intel->gen >= 5)
2793	    br = 2;
2794
2795	 assert(loop_stack_depth > 0);
2796	 loop_stack_depth--;
2797	 inst0 = inst1 = brw_WHILE(p, loop_stack[loop_stack_depth]);
2798	 /* patch all the BREAK/CONT instructions from last BGNLOOP */
2799	 while (inst0 > loop_stack[loop_stack_depth]) {
2800	    inst0--;
2801	    if (inst0->header.opcode == BRW_OPCODE_BREAK &&
2802		inst0->bits3.if_else.jump_count == 0) {
2803	       inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
2804	    }
2805	    else if (inst0->header.opcode == BRW_OPCODE_CONTINUE &&
2806		     inst0->bits3.if_else.jump_count == 0) {
2807	       inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
2808	    }
2809	 }
2810      }
2811	 break;
2812
2813      case FS_OPCODE_RCP:
2814      case FS_OPCODE_RSQ:
2815      case FS_OPCODE_SQRT:
2816      case FS_OPCODE_EXP2:
2817      case FS_OPCODE_LOG2:
2818      case FS_OPCODE_POW:
2819      case FS_OPCODE_SIN:
2820      case FS_OPCODE_COS:
2821	 generate_math(inst, dst, src);
2822	 break;
2823      case FS_OPCODE_LINTERP:
2824	 generate_linterp(inst, dst, src);
2825	 break;
2826      case FS_OPCODE_TEX:
2827      case FS_OPCODE_TXB:
2828      case FS_OPCODE_TXL:
2829	 generate_tex(inst, dst);
2830	 break;
2831      case FS_OPCODE_DISCARD_NOT:
2832	 generate_discard_not(inst, dst);
2833	 break;
2834      case FS_OPCODE_DISCARD_AND:
2835	 generate_discard_and(inst, src[0]);
2836	 break;
2837      case FS_OPCODE_DDX:
2838	 generate_ddx(inst, dst, src[0]);
2839	 break;
2840      case FS_OPCODE_DDY:
2841	 generate_ddy(inst, dst, src[0]);
2842	 break;
2843      case FS_OPCODE_FB_WRITE:
2844	 generate_fb_write(inst);
2845	 break;
2846      default:
2847	 if (inst->opcode < (int)ARRAY_SIZE(brw_opcodes)) {
2848	    _mesa_problem(ctx, "Unsupported opcode `%s' in FS",
2849			  brw_opcodes[inst->opcode].name);
2850	 } else {
2851	    _mesa_problem(ctx, "Unsupported opcode %d in FS", inst->opcode);
2852	 }
2853	 this->fail = true;
2854      }
2855
2856      while (annotation_len < p->nr_insn) {
2857	 annotation_len *= 2;
2858	 if (annotation_len < 16)
2859	    annotation_len = 16;
2860
2861	 this->annotation_string = talloc_realloc(this->mem_ctx,
2862						  annotation_string,
2863						  const char *,
2864						  annotation_len);
2865	 this->annotation_ir = talloc_realloc(this->mem_ctx,
2866					      annotation_ir,
2867					      ir_instruction *,
2868					      annotation_len);
2869      }
2870
2871      for (unsigned int i = last_native_inst; i < p->nr_insn; i++) {
2872	 this->annotation_string[i] = inst->annotation;
2873	 this->annotation_ir[i] = inst->ir;
2874      }
2875      last_native_inst = p->nr_insn;
2876   }
2877}
2878
2879GLboolean
2880brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c)
2881{
2882   struct brw_compile *p = &c->func;
2883   struct intel_context *intel = &brw->intel;
2884   GLcontext *ctx = &intel->ctx;
2885   struct brw_shader *shader = NULL;
2886   struct gl_shader_program *prog = ctx->Shader.CurrentProgram;
2887
2888   if (!prog)
2889      return GL_FALSE;
2890
2891   if (!using_new_fs)
2892      return GL_FALSE;
2893
2894   for (unsigned int i = 0; i < prog->_NumLinkedShaders; i++) {
2895      if (prog->_LinkedShaders[i]->Type == GL_FRAGMENT_SHADER) {
2896	 shader = (struct brw_shader *)prog->_LinkedShaders[i];
2897	 break;
2898      }
2899   }
2900   if (!shader)
2901      return GL_FALSE;
2902
2903   /* We always use 8-wide mode, at least for now.  For one, flow
2904    * control only works in 8-wide.  Also, when we're fragment shader
2905    * bound, we're almost always under register pressure as well, so
2906    * 8-wide would save us from the performance cliff of spilling
2907    * regs.
2908    */
2909   c->dispatch_width = 8;
2910
2911   if (INTEL_DEBUG & DEBUG_WM) {
2912      printf("GLSL IR for native fragment shader %d:\n", prog->Name);
2913      _mesa_print_ir(shader->ir, NULL);
2914      printf("\n");
2915   }
2916
2917   /* Now the main event: Visit the shader IR and generate our FS IR for it.
2918    */
2919   fs_visitor v(c, shader);
2920
2921   if (0) {
2922      v.emit_dummy_fs();
2923   } else {
2924      v.calculate_urb_setup();
2925      if (intel->gen < 6)
2926	 v.emit_interpolation_setup_gen4();
2927      else
2928	 v.emit_interpolation_setup_gen6();
2929
2930      /* Generate FS IR for main().  (the visitor only descends into
2931       * functions called "main").
2932       */
2933      foreach_iter(exec_list_iterator, iter, *shader->ir) {
2934	 ir_instruction *ir = (ir_instruction *)iter.get();
2935	 v.base_ir = ir;
2936	 ir->accept(&v);
2937      }
2938
2939      v.emit_fb_writes();
2940      v.assign_curb_setup();
2941      v.assign_urb_setup();
2942
2943      bool progress;
2944      do {
2945	 progress = false;
2946
2947	 v.calculate_live_intervals();
2948	 progress = v.propagate_constants() || progress;
2949	 progress = v.register_coalesce() || progress;
2950	 progress = v.compute_to_mrf() || progress;
2951	 progress = v.dead_code_eliminate() || progress;
2952      } while (progress);
2953
2954      if (0)
2955	 v.assign_regs_trivial();
2956      else
2957	 v.assign_regs();
2958   }
2959
2960   if (!v.fail)
2961      v.generate_code();
2962
2963   assert(!v.fail); /* FINISHME: Cleanly fail, tested at link time, etc. */
2964
2965   if (v.fail)
2966      return GL_FALSE;
2967
2968   if (INTEL_DEBUG & DEBUG_WM) {
2969      const char *last_annotation_string = NULL;
2970      ir_instruction *last_annotation_ir = NULL;
2971
2972      printf("Native code for fragment shader %d:\n", prog->Name);
2973      for (unsigned int i = 0; i < p->nr_insn; i++) {
2974	 if (last_annotation_ir != v.annotation_ir[i]) {
2975	    last_annotation_ir = v.annotation_ir[i];
2976	    if (last_annotation_ir) {
2977	       printf("   ");
2978	       last_annotation_ir->print();
2979	       printf("\n");
2980	    }
2981	 }
2982	 if (last_annotation_string != v.annotation_string[i]) {
2983	    last_annotation_string = v.annotation_string[i];
2984	    if (last_annotation_string)
2985	       printf("   %s\n", last_annotation_string);
2986	 }
2987	 brw_disasm(stdout, &p->store[i], intel->gen);
2988      }
2989      printf("\n");
2990   }
2991
2992   c->prog_data.total_grf = v.grf_used;
2993   c->prog_data.total_scratch = 0;
2994
2995   return GL_TRUE;
2996}
2997