brw_fs.cpp revision 62452e7d94a6353b59dfe0a8891d0709670dbeac
/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Eric Anholt <eric@anholt.net>
 *
 */

extern "C" {

#include <sys/types.h>

#include "main/macros.h"
#include "main/shaderobj.h"
#include "main/uniforms.h"
#include "program/prog_parameter.h"
#include "program/prog_print.h"
#include "program/prog_optimize.h"
#include "program/register_allocate.h"
#include "program/sampler.h"
#include "program/hash_table.h"
#include "brw_context.h"
#include "brw_eu.h"
#include "brw_wm.h"
#include "talloc.h"
}
#include "brw_fs.h"
#include "../glsl/glsl_types.h"
#include "../glsl/ir_optimization.h"
#include "../glsl/ir_print_visitor.h"

static struct brw_reg brw_reg_from_fs_reg(class fs_reg *reg);

struct gl_shader *
brw_new_shader(struct gl_context *ctx, GLuint name, GLuint type)
{
   struct brw_shader *shader;

   shader = talloc_zero(NULL, struct brw_shader);
   if (shader) {
      shader->base.Type = type;
      shader->base.Name = name;
      _mesa_init_shader(ctx, &shader->base);
   }

   return &shader->base;
}

struct gl_shader_program *
brw_new_shader_program(struct gl_context *ctx, GLuint name)
{
   struct brw_shader_program *prog;
   prog = talloc_zero(NULL, struct brw_shader_program);
   if (prog) {
      prog->base.Name = name;
      _mesa_init_shader_program(ctx, &prog->base);
   }
   return &prog->base;
}

GLboolean
brw_compile_shader(struct gl_context *ctx, struct gl_shader *shader)
{
   if (!_mesa_ir_compile_shader(ctx, shader))
      return GL_FALSE;

   return GL_TRUE;
}

GLboolean
brw_link_shader(struct gl_context *ctx, struct gl_shader_program *prog)
{
   struct intel_context *intel = intel_context(ctx);

   struct brw_shader *shader =
      (struct brw_shader *)prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
   if (shader != NULL) {
      void *mem_ctx = talloc_new(NULL);
      bool progress;

      if (shader->ir)
	 talloc_free(shader->ir);
      shader->ir = new(shader) exec_list;
      clone_ir_list(mem_ctx, shader->ir, shader->base.ir);

      do_mat_op_to_vec(shader->ir);
      do_mod_to_fract(shader->ir);
      do_div_to_mul_rcp(shader->ir);
      do_sub_to_add_neg(shader->ir);
      do_explog_to_explog2(shader->ir);
      do_lower_texture_projection(shader->ir);
      brw_do_cubemap_normalize(shader->ir);

      do {
	 progress = false;

	 brw_do_channel_expressions(shader->ir);
	 brw_do_vector_splitting(shader->ir);

	 progress = do_lower_jumps(shader->ir, true, true,
				   true, /* main return */
				   false, /* continue */
				   false /* loops */
				   ) || progress;

	 progress = do_common_optimization(shader->ir, true, 32) || progress;

	 progress = lower_noise(shader->ir) || progress;
	 progress =
	    lower_variable_index_to_cond_assign(shader->ir,
						GL_TRUE, /* input */
						GL_TRUE, /* output */
						GL_TRUE, /* temp */
						GL_TRUE /* uniform */
						) || progress;
	 if (intel->gen == 6) {
	    progress = do_if_to_cond_assign(shader->ir) || progress;
	 }
      } while (progress);

      validate_ir_tree(shader->ir);

      reparent_ir(shader->ir, shader->ir);
      talloc_free(mem_ctx);
   }

   if (!_mesa_ir_link_shader(ctx, prog))
      return GL_FALSE;

   return GL_TRUE;
}

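/* Returns the size of a GLSL type in scalar components, the unit our
 * virtual GRFs are allocated in.  Vectors and matrices both land in the
 * scalar cases below, since components() already accounts for
 * vector_elements * matrix_columns.
 */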
static int
type_size(const struct glsl_type *type)
{
   unsigned int size, i;

   switch (type->base_type) {
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_BOOL:
      return type->components();
   case GLSL_TYPE_ARRAY:
      return type_size(type->fields.array) * type->length;
   case GLSL_TYPE_STRUCT:
      size = 0;
      for (i = 0; i < type->length; i++) {
	 size += type_size(type->fields.structure[i].type);
      }
      return size;
   case GLSL_TYPE_SAMPLER:
      /* Samplers take up no register space, since they're baked in at
       * link time.
       */
      return 0;
   default:
      assert(!"not reached");
      return 0;
   }
}

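/* Allocates a new virtual GRF of the given size (in scalar components)
 * and returns its index.  The size array grows by doubling, so the
 * allocation is amortized O(1); slot 0 is kept permanently unused so a
 * reg number of 0 never names a real allocation.
 */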
int
fs_visitor::virtual_grf_alloc(int size)
{
   if (virtual_grf_array_size <= virtual_grf_next) {
      if (virtual_grf_array_size == 0)
	 virtual_grf_array_size = 16;
      else
	 virtual_grf_array_size *= 2;
      virtual_grf_sizes = talloc_realloc(mem_ctx, virtual_grf_sizes,
					 int, virtual_grf_array_size);

      /* This slot is always unused. */
      virtual_grf_sizes[0] = 0;
   }
   virtual_grf_sizes[virtual_grf_next] = size;
   return virtual_grf_next++;
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int hw_reg)
{
   init();
   this->file = file;
   this->hw_reg = hw_reg;
   this->type = BRW_REGISTER_TYPE_F;
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int hw_reg, uint32_t type)
{
   init();
   this->file = file;
   this->hw_reg = hw_reg;
   this->type = type;
}

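/* Maps a GLSL base type to the BRW register type arithmetic happens in:
 * floats stay F, ints and bools become signed dwords (this backend
 * stores bools as 0/1 D values), and uints become unsigned dwords.
 */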
int
brw_type_for_base_type(const struct glsl_type *type)
{
   switch (type->base_type) {
   case GLSL_TYPE_FLOAT:
      return BRW_REGISTER_TYPE_F;
   case GLSL_TYPE_INT:
   case GLSL_TYPE_BOOL:
      return BRW_REGISTER_TYPE_D;
   case GLSL_TYPE_UINT:
      return BRW_REGISTER_TYPE_UD;
   case GLSL_TYPE_ARRAY:
   case GLSL_TYPE_STRUCT:
      /* These should be overridden with the type of the member when
       * dereferenced into.  BRW_REGISTER_TYPE_UD seems like a likely
       * way to trip up if we don't.
       */
      return BRW_REGISTER_TYPE_UD;
   default:
      assert(!"not reached");
      return BRW_REGISTER_TYPE_F;
   }
}

/** Automatic reg constructor. */
fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
{
   init();

   this->file = GRF;
   this->reg = v->virtual_grf_alloc(type_size(type));
   this->reg_offset = 0;
   this->type = brw_type_for_base_type(type);
}

fs_reg *
fs_visitor::variable_storage(ir_variable *var)
{
   return (fs_reg *)hash_table_find(this->variable_ht, var);
}

/* Our support for uniforms is piggy-backed on the struct
 * gl_fragment_program, because that's where the values actually
 * get stored, rather than in some global gl_shader_program uniform
 * store.
 */
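/* For example, a mat3 uniform is walked as three vec3 columns: each
 * column is one parameter slot (one vec_values row) contributing three
 * pointers into c->prog_data.param, so the matrix advances loc by 3 and
 * nr_params by 9.
 */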
int
fs_visitor::setup_uniform_values(int loc, const glsl_type *type)
{
   unsigned int offset = 0;
   float *vec_values;

   if (type->is_matrix()) {
      const glsl_type *column = glsl_type::get_instance(GLSL_TYPE_FLOAT,
							type->vector_elements,
							1);

      for (unsigned int i = 0; i < type->matrix_columns; i++) {
	 offset += setup_uniform_values(loc + offset, column);
      }

      return offset;
   }

   switch (type->base_type) {
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_BOOL:
      vec_values = fp->Base.Parameters->ParameterValues[loc];
      for (unsigned int i = 0; i < type->vector_elements; i++) {
	 assert(c->prog_data.nr_params < ARRAY_SIZE(c->prog_data.param));
	 c->prog_data.param[c->prog_data.nr_params++] = &vec_values[i];
      }
      return 1;

   case GLSL_TYPE_STRUCT:
      for (unsigned int i = 0; i < type->length; i++) {
	 offset += setup_uniform_values(loc + offset,
					type->fields.structure[i].type);
      }
      return offset;

   case GLSL_TYPE_ARRAY:
      for (unsigned int i = 0; i < type->length; i++) {
	 offset += setup_uniform_values(loc + offset, type->fields.array);
      }
      return offset;

   case GLSL_TYPE_SAMPLER:
      /* The sampler takes up a slot, but we don't use any values from it. */
      return 1;

   default:
      assert(!"not reached");
      return 0;
   }
}


/* Our support for builtin uniforms is even scarier than non-builtin.
 * It sits on top of the PROG_STATE_VAR parameters that are
 * automatically updated from GL context state.
 */
void
fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
{
   const struct gl_builtin_uniform_desc *statevar = NULL;

   for (unsigned int i = 0; _mesa_builtin_uniform_desc[i].name; i++) {
      if (strcmp(ir->name, _mesa_builtin_uniform_desc[i].name) == 0) {
	 statevar = &_mesa_builtin_uniform_desc[i];
	 break;
      }
   }

   if (statevar == NULL) {
      this->fail = true;
      printf("Failed to find builtin uniform `%s'\n", ir->name);
      return;
   }

   int array_count;
   if (ir->type->is_array()) {
      array_count = ir->type->length;
   } else {
      array_count = 1;
   }

   for (int a = 0; a < array_count; a++) {
      for (unsigned int i = 0; i < statevar->num_elements; i++) {
	 struct gl_builtin_uniform_element *element = &statevar->elements[i];
	 int tokens[STATE_LENGTH];

	 memcpy(tokens, element->tokens, sizeof(element->tokens));
	 if (ir->type->is_array()) {
	    tokens[1] = a;
	 }

	 /* This state reference has already been set up by ir_to_mesa,
	  * but we'll get the same index back here.
	  */
	 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
					       (gl_state_index *)tokens);
	 float *vec_values = this->fp->Base.Parameters->ParameterValues[index];

	 /* Add each of the unique swizzles of the element as a
	  * parameter.  This'll end up matching the expected layout of
	  * the array/matrix/structure we're trying to fill in.
	  */
	 int last_swiz = -1;
	 for (unsigned int j = 0; j < 4; j++) {
	    int swiz = GET_SWZ(element->swizzle, j);
	    if (swiz == last_swiz)
	       break;
	    last_swiz = swiz;

	    c->prog_data.param[c->prog_data.nr_params++] = &vec_values[swiz];
	 }
      }
   }
}

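/* gl_FragCoord needs per-fragment fixup from the hardware pixel
 * positions: a 0.5 offset when pixel centers are at half-integers, and
 * a Y flip (height - 1 - pixel_y, folded into a negated source plus a
 * constant) when the origin is lower-left instead of the hardware's
 * upper-left.
 */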
fs_reg *
fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   fs_reg wpos = *reg;

   /* gl_FragCoord.x */
   if (ir->pixel_center_integer) {
      emit(fs_inst(BRW_OPCODE_MOV, wpos, this->pixel_x));
   } else {
      emit(fs_inst(BRW_OPCODE_ADD, wpos, this->pixel_x, fs_reg(0.5f)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.y */
   if (ir->origin_upper_left && ir->pixel_center_integer) {
      emit(fs_inst(BRW_OPCODE_MOV, wpos, this->pixel_y));
   } else {
      fs_reg pixel_y = this->pixel_y;
      float offset = (ir->pixel_center_integer ? 0.0 : 0.5);

      if (!ir->origin_upper_left) {
	 pixel_y.negate = true;
	 offset += c->key.drawable_height - 1.0;
      }

      emit(fs_inst(BRW_OPCODE_ADD, wpos, pixel_y, fs_reg(offset)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.z */
   emit(fs_inst(FS_OPCODE_LINTERP, wpos, this->delta_x, this->delta_y,
		interp_reg(FRAG_ATTRIB_WPOS, 2)));
   wpos.reg_offset++;

   /* gl_FragCoord.w: Already set up in emit_interpolation_setup_*(). */
   emit(fs_inst(BRW_OPCODE_MOV, wpos, this->wpos_w));

   return reg;
}

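/* Emits one LINTERP per scalar channel of a varying, walking array
 * elements and matrix columns.  Slots with no incoming URB setup are
 * skipped.  Before gen6 the plane equations are set up from per-vertex
 * attribute/w, so the interpolated result is multiplied back by pixel_w
 * (the reciprocal of the interpolated 1/w) to undo the perspective
 * division.
 */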
fs_reg *
fs_visitor::emit_general_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   /* Interpolation is always in floating point regs. */
   reg->type = BRW_REGISTER_TYPE_F;
   fs_reg attr = *reg;

   unsigned int array_elements;
   const glsl_type *type;

   if (ir->type->is_array()) {
      array_elements = ir->type->length;
      if (array_elements == 0) {
	 this->fail = true;
      }
      type = ir->type->fields.array;
   } else {
      array_elements = 1;
      type = ir->type;
   }

   int location = ir->location;
   for (unsigned int i = 0; i < array_elements; i++) {
      for (unsigned int j = 0; j < type->matrix_columns; j++) {
	 if (urb_setup[location] == -1) {
	    /* If there's no incoming setup data for this slot, don't
	     * emit interpolation for it.
	     */
	    attr.reg_offset += type->vector_elements;
	    location++;
	    continue;
	 }

	 for (unsigned int c = 0; c < type->vector_elements; c++) {
	    struct brw_reg interp = interp_reg(location, c);
	    emit(fs_inst(FS_OPCODE_LINTERP,
			 attr,
			 this->delta_x,
			 this->delta_y,
			 fs_reg(interp)));
	    attr.reg_offset++;
	 }

	 if (intel->gen < 6) {
	    attr.reg_offset -= type->vector_elements;
	    for (unsigned int c = 0; c < type->vector_elements; c++) {
	       emit(fs_inst(BRW_OPCODE_MUL,
			    attr,
			    attr,
			    this->pixel_w));
	       attr.reg_offset++;
	    }
	 }
	 location++;
      }
   }

   return reg;
}

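/* gl_FrontFacing comes from a payload bit rather than interpolation.
 * On gen6 the back-facing flag appears to live at bit 15 of g0.0
 * (hence the ASR by 15), and NOT/AND 1 invert it into a 0/1
 * front-facing value; before gen6 we test bit 31 of g1.6 instead.
 */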
fs_reg *
fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);

   /* The frontfacing comes in as a bit in the thread payload. */
   if (intel->gen >= 6) {
      emit(fs_inst(BRW_OPCODE_ASR,
		   *reg,
		   fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
		   fs_reg(15)));
      emit(fs_inst(BRW_OPCODE_NOT,
		   *reg,
		   *reg));
      emit(fs_inst(BRW_OPCODE_AND,
		   *reg,
		   *reg,
		   fs_reg(1)));
   } else {
      struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
      /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
       * us front face
       */
      fs_inst *inst = emit(fs_inst(BRW_OPCODE_CMP,
				   *reg,
				   fs_reg(r1_6ud),
				   fs_reg(1u << 31)));
      inst->conditional_mod = BRW_CONDITIONAL_L;
      emit(fs_inst(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u)));
   }

   return reg;
}

fs_inst *
fs_visitor::emit_math(fs_opcodes opcode, fs_reg dst, fs_reg src)
{
   switch (opcode) {
   case FS_OPCODE_RCP:
   case FS_OPCODE_RSQ:
   case FS_OPCODE_SQRT:
   case FS_OPCODE_EXP2:
   case FS_OPCODE_LOG2:
   case FS_OPCODE_SIN:
   case FS_OPCODE_COS:
      break;
   default:
      assert(!"not reached: bad math opcode");
      return NULL;
   }

   /* Can't do hstride == 0 args to gen6 math, so expand it out.  We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    */
   if (intel->gen >= 6 && src.file == UNIFORM) {
      fs_reg expanded = fs_reg(this, glsl_type::float_type);
      emit(fs_inst(BRW_OPCODE_MOV, expanded, src));
      src = expanded;
   }

   fs_inst *inst = emit(fs_inst(opcode, dst, src));

   if (intel->gen < 6) {
      inst->base_mrf = 2;
      inst->mlen = 1;
   }

   return inst;
}

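/* Two-source math, which is only POW for now.  On gen6+ math is an
 * ordinary instruction but can't take hstride-0 (uniform) sources, so
 * those are expanded into temporaries first.  Before gen6 it's a send
 * to the shared math unit: src1 is preloaded into the MRF after
 * base_mrf, giving a message length of 2.
 */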
fs_inst *
fs_visitor::emit_math(fs_opcodes opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   int base_mrf = 2;
   fs_inst *inst;

   assert(opcode == FS_OPCODE_POW);

   if (intel->gen >= 6) {
      /* Can't do hstride == 0 args to gen6 math, so expand it out. */
      if (src0.file == UNIFORM) {
	 fs_reg expanded = fs_reg(this, glsl_type::float_type);
	 emit(fs_inst(BRW_OPCODE_MOV, expanded, src0));
	 src0 = expanded;
      }

      if (src1.file == UNIFORM) {
	 fs_reg expanded = fs_reg(this, glsl_type::float_type);
	 emit(fs_inst(BRW_OPCODE_MOV, expanded, src1));
	 src1 = expanded;
      }

      inst = emit(fs_inst(opcode, dst, src0, src1));
   } else {
      emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1), src1));
      inst = emit(fs_inst(opcode, dst, src0, reg_null_f));

      inst->base_mrf = base_mrf;
      inst->mlen = 2;
   }
   return inst;
}

void
fs_visitor::visit(ir_variable *ir)
{
   fs_reg *reg = NULL;

   if (variable_storage(ir))
      return;

   if (strcmp(ir->name, "gl_FragColor") == 0) {
      this->frag_color = ir;
   } else if (strcmp(ir->name, "gl_FragData") == 0) {
      this->frag_data = ir;
   } else if (strcmp(ir->name, "gl_FragDepth") == 0) {
      this->frag_depth = ir;
   }

   if (ir->mode == ir_var_in) {
      if (!strcmp(ir->name, "gl_FragCoord")) {
	 reg = emit_fragcoord_interpolation(ir);
      } else if (!strcmp(ir->name, "gl_FrontFacing")) {
	 reg = emit_frontfacing_interpolation(ir);
      } else {
	 reg = emit_general_interpolation(ir);
      }
      assert(reg);
      hash_table_insert(this->variable_ht, reg, ir);
      return;
   }

   if (ir->mode == ir_var_uniform) {
      int param_index = c->prog_data.nr_params;

      if (!strncmp(ir->name, "gl_", 3)) {
	 setup_builtin_uniform_values(ir);
      } else {
	 setup_uniform_values(ir->location, ir->type);
      }

      reg = new(this->mem_ctx) fs_reg(UNIFORM, param_index);
   }

   if (!reg)
      reg = new(this->mem_ctx) fs_reg(this, ir->type);

   hash_table_insert(this->variable_ht, reg, ir);
}

void
fs_visitor::visit(ir_dereference_variable *ir)
{
   fs_reg *reg = variable_storage(ir->var);
   this->result = *reg;
}

void
fs_visitor::visit(ir_dereference_record *ir)
{
   const glsl_type *struct_type = ir->record->type;

   ir->record->accept(this);

   unsigned int offset = 0;
   for (unsigned int i = 0; i < struct_type->length; i++) {
      if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
	 break;
      offset += type_size(struct_type->fields.structure[i].type);
   }
   this->result.reg_offset += offset;
   this->result.type = brw_type_for_base_type(ir->type);
}

void
fs_visitor::visit(ir_dereference_array *ir)
{
   ir_constant *index;
   int element_size;

   ir->array->accept(this);
   index = ir->array_index->as_constant();

   element_size = type_size(ir->type);
   this->result.type = brw_type_for_base_type(ir->type);

   if (index) {
      assert(this->result.file == UNIFORM ||
	     (this->result.file == GRF &&
	      this->result.reg != 0));
      this->result.reg_offset += index->value.i[0] * element_size;
   } else {
      assert(!"FINISHME: non-constant array element");
   }
}

void
fs_visitor::visit(ir_expression *ir)
{
   unsigned int operand;
   fs_reg op[2];
   fs_inst *inst;

   for (operand = 0; operand < ir->get_num_operands(); operand++) {
      ir->operands[operand]->accept(this);
      if (this->result.file == BAD_FILE) {
	 ir_print_visitor v;
	 printf("Failed to get tree for expression operand:\n");
	 ir->operands[operand]->accept(&v);
	 this->fail = true;
      }
      op[operand] = this->result;

      /* Matrix expression operands should have been broken down to vector
       * operations already.
       */
      assert(!ir->operands[operand]->type->is_matrix());
      /* And then those vector operands should have been broken down to scalar.
       */
      assert(!ir->operands[operand]->type->is_vector());
   }

   /* Storage for our result.  If our result goes into an assignment, it will
    * just get copy-propagated out, so no worries.
    */
   this->result = fs_reg(this, ir->type);

   switch (ir->operation) {
   case ir_unop_logic_not:
      /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
       * the one's complement of the whole register, not just bit 0.
       */
      emit(fs_inst(BRW_OPCODE_XOR, this->result, op[0], fs_reg(1)));
      break;
   case ir_unop_neg:
      op[0].negate = !op[0].negate;
      this->result = op[0];
      break;
   case ir_unop_abs:
      op[0].abs = true;
      this->result = op[0];
      break;
   case ir_unop_sign:
      emit(fs_inst(BRW_OPCODE_MOV, this->result, fs_reg(0.0f)));

      inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null_f, op[0], fs_reg(0.0f)));
      inst->conditional_mod = BRW_CONDITIONAL_G;
      inst = emit(fs_inst(BRW_OPCODE_MOV, this->result, fs_reg(1.0f)));
      inst->predicated = true;

      inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null_f, op[0], fs_reg(0.0f)));
      inst->conditional_mod = BRW_CONDITIONAL_L;
      inst = emit(fs_inst(BRW_OPCODE_MOV, this->result, fs_reg(-1.0f)));
      inst->predicated = true;

      break;
   case ir_unop_rcp:
      emit_math(FS_OPCODE_RCP, this->result, op[0]);
      break;

   case ir_unop_exp2:
      emit_math(FS_OPCODE_EXP2, this->result, op[0]);
      break;
   case ir_unop_log2:
      emit_math(FS_OPCODE_LOG2, this->result, op[0]);
      break;
   case ir_unop_exp:
   case ir_unop_log:
      assert(!"not reached: should be handled by ir_explog_to_explog2");
      break;
   case ir_unop_sin:
      emit_math(FS_OPCODE_SIN, this->result, op[0]);
      break;
   case ir_unop_cos:
      emit_math(FS_OPCODE_COS, this->result, op[0]);
      break;

   case ir_unop_dFdx:
      emit(fs_inst(FS_OPCODE_DDX, this->result, op[0]));
      break;
   case ir_unop_dFdy:
      emit(fs_inst(FS_OPCODE_DDY, this->result, op[0]));
      break;

   case ir_binop_add:
      emit(fs_inst(BRW_OPCODE_ADD, this->result, op[0], op[1]));
      break;
   case ir_binop_sub:
      assert(!"not reached: should be handled by ir_sub_to_add_neg");
      break;

   case ir_binop_mul:
      emit(fs_inst(BRW_OPCODE_MUL, this->result, op[0], op[1]));
      break;
   case ir_binop_div:
      assert(!"not reached: should be handled by ir_div_to_mul_rcp");
      break;
   case ir_binop_mod:
      assert(!"ir_binop_mod should have been converted to b * fract(a/b)");
      break;

   case ir_binop_less:
      inst = emit(fs_inst(BRW_OPCODE_CMP, this->result, op[0], op[1]));
      inst->conditional_mod = BRW_CONDITIONAL_L;
      emit(fs_inst(BRW_OPCODE_AND, this->result, this->result, fs_reg(0x1)));
      break;
   case ir_binop_greater:
      inst = emit(fs_inst(BRW_OPCODE_CMP, this->result, op[0], op[1]));
      inst->conditional_mod = BRW_CONDITIONAL_G;
      emit(fs_inst(BRW_OPCODE_AND, this->result, this->result, fs_reg(0x1)));
      break;
   case ir_binop_lequal:
      inst = emit(fs_inst(BRW_OPCODE_CMP, this->result, op[0], op[1]));
      inst->conditional_mod = BRW_CONDITIONAL_LE;
      emit(fs_inst(BRW_OPCODE_AND, this->result, this->result, fs_reg(0x1)));
      break;
   case ir_binop_gequal:
      inst = emit(fs_inst(BRW_OPCODE_CMP, this->result, op[0], op[1]));
      inst->conditional_mod = BRW_CONDITIONAL_GE;
      emit(fs_inst(BRW_OPCODE_AND, this->result, this->result, fs_reg(0x1)));
      break;
   case ir_binop_equal:
   case ir_binop_all_equal: /* same as equal for scalars */
      inst = emit(fs_inst(BRW_OPCODE_CMP, this->result, op[0], op[1]));
      inst->conditional_mod = BRW_CONDITIONAL_Z;
      emit(fs_inst(BRW_OPCODE_AND, this->result, this->result, fs_reg(0x1)));
      break;
   case ir_binop_nequal:
   case ir_binop_any_nequal: /* same as nequal for scalars */
      inst = emit(fs_inst(BRW_OPCODE_CMP, this->result, op[0], op[1]));
      inst->conditional_mod = BRW_CONDITIONAL_NZ;
      emit(fs_inst(BRW_OPCODE_AND, this->result, this->result, fs_reg(0x1)));
      break;

   case ir_binop_logic_xor:
      emit(fs_inst(BRW_OPCODE_XOR, this->result, op[0], op[1]));
      break;

   case ir_binop_logic_or:
      emit(fs_inst(BRW_OPCODE_OR, this->result, op[0], op[1]));
      break;

   case ir_binop_logic_and:
      emit(fs_inst(BRW_OPCODE_AND, this->result, op[0], op[1]));
      break;

   case ir_binop_dot:
   case ir_binop_cross:
   case ir_unop_any:
      assert(!"not reached: should be handled by brw_fs_channel_expressions");
      break;

   case ir_unop_noise:
      assert(!"not reached: should be handled by lower_noise");
      break;

   case ir_unop_sqrt:
      emit_math(FS_OPCODE_SQRT, this->result, op[0]);
      break;

   case ir_unop_rsq:
      emit_math(FS_OPCODE_RSQ, this->result, op[0]);
      break;

   case ir_unop_i2f:
   case ir_unop_b2f:
   case ir_unop_b2i:
   case ir_unop_f2i:
      emit(fs_inst(BRW_OPCODE_MOV, this->result, op[0]));
      break;
   case ir_unop_f2b:
   case ir_unop_i2b:
      inst = emit(fs_inst(BRW_OPCODE_CMP, this->result, op[0], fs_reg(0.0f)));
      inst->conditional_mod = BRW_CONDITIONAL_NZ;
      inst = emit(fs_inst(BRW_OPCODE_AND, this->result,
			  this->result, fs_reg(1)));
      break;

   case ir_unop_trunc:
      emit(fs_inst(BRW_OPCODE_RNDZ, this->result, op[0]));
      break;
   case ir_unop_ceil:
      op[0].negate = !op[0].negate;
      inst = emit(fs_inst(BRW_OPCODE_RNDD, this->result, op[0]));
      this->result.negate = true;
      break;
   case ir_unop_floor:
      inst = emit(fs_inst(BRW_OPCODE_RNDD, this->result, op[0]));
      break;
   case ir_unop_fract:
      inst = emit(fs_inst(BRW_OPCODE_FRC, this->result, op[0]));
      break;
   case ir_unop_round_even:
      emit(fs_inst(BRW_OPCODE_RNDE, this->result, op[0]));
      break;

   case ir_binop_min:
      inst = emit(fs_inst(BRW_OPCODE_CMP, this->result, op[0], op[1]));
      inst->conditional_mod = BRW_CONDITIONAL_L;

      inst = emit(fs_inst(BRW_OPCODE_SEL, this->result, op[0], op[1]));
      inst->predicated = true;
      break;
   case ir_binop_max:
      inst = emit(fs_inst(BRW_OPCODE_CMP, this->result, op[0], op[1]));
      inst->conditional_mod = BRW_CONDITIONAL_G;

      inst = emit(fs_inst(BRW_OPCODE_SEL, this->result, op[0], op[1]));
      inst->predicated = true;
      break;

   case ir_binop_pow:
      emit_math(FS_OPCODE_POW, this->result, op[0], op[1]);
      break;

   case ir_unop_bit_not:
   case ir_unop_u2f:
   case ir_binop_lshift:
   case ir_binop_rshift:
   case ir_binop_bit_and:
   case ir_binop_bit_xor:
   case ir_binop_bit_or:
      assert(!"GLSL 1.30 features unsupported");
      break;
   }
}

void
fs_visitor::emit_assignment_writes(fs_reg &l, fs_reg &r,
				   const glsl_type *type, bool predicated)
{
   switch (type->base_type) {
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_BOOL:
      for (unsigned int i = 0; i < type->components(); i++) {
	 l.type = brw_type_for_base_type(type);
	 r.type = brw_type_for_base_type(type);

	 fs_inst *inst = emit(fs_inst(BRW_OPCODE_MOV, l, r));
	 inst->predicated = predicated;

	 l.reg_offset++;
	 r.reg_offset++;
      }
      break;
   case GLSL_TYPE_ARRAY:
      for (unsigned int i = 0; i < type->length; i++) {
	 emit_assignment_writes(l, r, type->fields.array, predicated);
      }
      break;

   case GLSL_TYPE_STRUCT:
      for (unsigned int i = 0; i < type->length; i++) {
	 emit_assignment_writes(l, r, type->fields.structure[i].type,
				predicated);
      }
      break;

   case GLSL_TYPE_SAMPLER:
      break;

   default:
      assert(!"not reached");
      break;
   }
}

void
fs_visitor::visit(ir_assignment *ir)
{
   struct fs_reg l, r;
   fs_inst *inst;

   /* FINISHME: arrays on the lhs */
   ir->lhs->accept(this);
   l = this->result;

   ir->rhs->accept(this);
   r = this->result;

   assert(l.file != BAD_FILE);
   assert(r.file != BAD_FILE);

   if (ir->condition) {
      emit_bool_to_cond_code(ir->condition);
   }

   if (ir->lhs->type->is_scalar() ||
       ir->lhs->type->is_vector()) {
      for (int i = 0; i < ir->lhs->type->vector_elements; i++) {
	 if (ir->write_mask & (1 << i)) {
	    inst = emit(fs_inst(BRW_OPCODE_MOV, l, r));
	    if (ir->condition)
	       inst->predicated = true;
	    r.reg_offset++;
	 }
	 l.reg_offset++;
      }
   } else {
      emit_assignment_writes(l, r, ir->lhs->type, ir->condition != NULL);
   }
}

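/* Builds the gen4 SIMD8 sampler message by hand: a g0 header, then
 * u/v/r (the three slots are always present), then the optional shadow
 * comparison value and/or bias/LOD.  Non-shadow bias/LOD messages only
 * exist in SIMD16 form on gen4, so that case pads the payload out and
 * afterwards copies the even-indexed halves of the interleaved return
 * value back into the expected SIMD8 layout.
 */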
fs_inst *
fs_visitor::emit_texture_gen4(ir_texture *ir, fs_reg dst, fs_reg coordinate)
{
   int mlen;
   int base_mrf = 1;
   bool simd16 = false;
   fs_reg orig_dst;

   /* g0 header. */
   mlen = 1;

   if (ir->shadow_comparitor) {
      for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
	 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i),
		      coordinate));
	 coordinate.reg_offset++;
      }
      /* gen4's SIMD8 sampler always has the slots for u,v,r present. */
      mlen += 3;

      if (ir->op == ir_tex) {
	 /* There's no plain shadow compare message, so we use shadow
	  * compare with a bias of 0.0.
	  */
	 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen),
		      fs_reg(0.0f)));
	 mlen++;
      } else if (ir->op == ir_txb) {
	 ir->lod_info.bias->accept(this);
	 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen),
		      this->result));
	 mlen++;
      } else {
	 assert(ir->op == ir_txl);
	 ir->lod_info.lod->accept(this);
	 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen),
		      this->result));
	 mlen++;
      }

      ir->shadow_comparitor->accept(this);
      emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result));
      mlen++;
   } else if (ir->op == ir_tex) {
      for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
	 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i),
		      coordinate));
	 coordinate.reg_offset++;
      }
      /* gen4's SIMD8 sampler always has the slots for u,v,r present. */
      mlen += 3;
   } else {
      /* Oh joy.  gen4 doesn't have SIMD8 non-shadow-compare bias/lod
       * instructions.  We'll need to do SIMD16 here.
       */
      assert(ir->op == ir_txb || ir->op == ir_txl);

      for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
	 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i * 2),
		      coordinate));
	 coordinate.reg_offset++;
      }

      /* lod/bias appears after u/v/r. */
      mlen += 6;

      if (ir->op == ir_txb) {
	 ir->lod_info.bias->accept(this);
	 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen),
		      this->result));
	 mlen++;
      } else {
	 ir->lod_info.lod->accept(this);
	 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen),
		      this->result));
	 mlen++;
      }

      /* The unused upper half. */
      mlen++;

      /* Now, since we're doing simd16, the return is 2 interleaved
       * vec4s where the odd-indexed ones are junk. We'll need to move
       * this weirdness around to the expected layout.
       */
      simd16 = true;
      orig_dst = dst;
      dst = fs_reg(this, glsl_type::get_array_instance(glsl_type::vec4_type,
						       2));
      dst.type = BRW_REGISTER_TYPE_F;
   }

   fs_inst *inst = NULL;
   switch (ir->op) {
   case ir_tex:
      inst = emit(fs_inst(FS_OPCODE_TEX, dst));
      break;
   case ir_txb:
      inst = emit(fs_inst(FS_OPCODE_TXB, dst));
      break;
   case ir_txl:
      inst = emit(fs_inst(FS_OPCODE_TXL, dst));
      break;
   case ir_txd:
   case ir_txf:
      assert(!"GLSL 1.30 features unsupported");
      break;
   }
   inst->base_mrf = base_mrf;
   inst->mlen = mlen;

   if (simd16) {
      for (int i = 0; i < 4; i++) {
	 emit(fs_inst(BRW_OPCODE_MOV, orig_dst, dst));
	 orig_dst.reg_offset++;
	 dst.reg_offset += 2;
      }
   }

   return inst;
}

fs_inst *
fs_visitor::emit_texture_gen5(ir_texture *ir, fs_reg dst, fs_reg coordinate)
{
   /* gen5's SIMD8 sampler has slots for u, v, r, array index, followed
    * by optional parameters like the shadow comparison value or LOD
    * bias.  When the optional parameters aren't present, the trailing
    * base slots can be omitted from the message entirely.
    *
    * We never write the unused slots either way, which may look
    * surprising in the disassembly.
    */
   int mlen = 1; /* g0 header always present. */
   int base_mrf = 1;

   for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
      emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i),
		   coordinate));
      coordinate.reg_offset++;
   }
   mlen += ir->coordinate->type->vector_elements;

   if (ir->shadow_comparitor) {
      mlen = MAX2(mlen, 5);

      ir->shadow_comparitor->accept(this);
      emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result));
      mlen++;
   }

   fs_inst *inst = NULL;
   switch (ir->op) {
   case ir_tex:
      inst = emit(fs_inst(FS_OPCODE_TEX, dst));
      break;
   case ir_txb:
      ir->lod_info.bias->accept(this);
      mlen = MAX2(mlen, 5);
      emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result));
      mlen++;

      inst = emit(fs_inst(FS_OPCODE_TXB, dst));
      break;
   case ir_txl:
      ir->lod_info.lod->accept(this);
      mlen = MAX2(mlen, 5);
      emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result));
      mlen++;

      inst = emit(fs_inst(FS_OPCODE_TXL, dst));
      break;
   case ir_txd:
   case ir_txf:
      assert(!"GLSL 1.30 features unsupported");
      break;
   }
   inst->base_mrf = base_mrf;
   inst->mlen = mlen;

   return inst;
}

void
fs_visitor::visit(ir_texture *ir)
{
   int sampler;
   fs_inst *inst = NULL;

   ir->coordinate->accept(this);
   fs_reg coordinate = this->result;

   /* Should be lowered by do_lower_texture_projection */
   assert(!ir->projector);

   sampler = _mesa_get_sampler_uniform_value(ir->sampler,
					     ctx->Shader.CurrentProgram,
					     &brw->fragment_program->Base);
   sampler = c->fp->program.Base.SamplerUnits[sampler];

   /* The 965 requires the EU to do the normalization of GL rectangle
    * texture coordinates.  We use the program parameter state
    * tracking to get the scaling factor.
    */
   if (ir->sampler->type->sampler_dimensionality == GLSL_SAMPLER_DIM_RECT) {
      struct gl_program_parameter_list *params = c->fp->program.Base.Parameters;
      int tokens[STATE_LENGTH] = {
	 STATE_INTERNAL,
	 STATE_TEXRECT_SCALE,
	 sampler,
	 0,
	 0
      };

      fs_reg scale_x = fs_reg(UNIFORM, c->prog_data.nr_params);
      fs_reg scale_y = fs_reg(UNIFORM, c->prog_data.nr_params + 1);
      GLuint index = _mesa_add_state_reference(params,
					       (gl_state_index *)tokens);
      float *vec_values = this->fp->Base.Parameters->ParameterValues[index];

      c->prog_data.param[c->prog_data.nr_params++] = &vec_values[0];
      c->prog_data.param[c->prog_data.nr_params++] = &vec_values[1];

      fs_reg dst = fs_reg(this, ir->coordinate->type);
      fs_reg src = coordinate;
      coordinate = dst;

      emit(fs_inst(BRW_OPCODE_MUL, dst, src, scale_x));
      dst.reg_offset++;
      src.reg_offset++;
      emit(fs_inst(BRW_OPCODE_MUL, dst, src, scale_y));
   }

   /* Writemasking doesn't eliminate channels on SIMD8 texture
    * samples, so don't worry about them.
    */
   fs_reg dst = fs_reg(this, glsl_type::vec4_type);

   if (intel->gen < 5) {
      inst = emit_texture_gen4(ir, dst, coordinate);
   } else {
      inst = emit_texture_gen5(ir, dst, coordinate);
   }

   inst->sampler = sampler;

   this->result = dst;

   if (ir->shadow_comparitor)
      inst->shadow_compare = true;

   if (c->key.tex_swizzles[inst->sampler] != SWIZZLE_NOOP) {
      fs_reg swizzle_dst = fs_reg(this, glsl_type::vec4_type);

      for (int i = 0; i < 4; i++) {
	 int swiz = GET_SWZ(c->key.tex_swizzles[inst->sampler], i);
	 fs_reg l = swizzle_dst;
	 l.reg_offset += i;

	 if (swiz == SWIZZLE_ZERO) {
	    emit(fs_inst(BRW_OPCODE_MOV, l, fs_reg(0.0f)));
	 } else if (swiz == SWIZZLE_ONE) {
	    emit(fs_inst(BRW_OPCODE_MOV, l, fs_reg(1.0f)));
	 } else {
	    fs_reg r = dst;
	    r.reg_offset += swiz;
	    emit(fs_inst(BRW_OPCODE_MOV, l, r));
	 }
      }
      this->result = swizzle_dst;
   }
}

void
fs_visitor::visit(ir_swizzle *ir)
{
   ir->val->accept(this);
   fs_reg val = this->result;

   if (ir->type->vector_elements == 1) {
      this->result.reg_offset += ir->mask.x;
      return;
   }

   fs_reg result = fs_reg(this, ir->type);
   this->result = result;

   for (unsigned int i = 0; i < ir->type->vector_elements; i++) {
      fs_reg channel = val;
      int swiz = 0;

      switch (i) {
      case 0:
	 swiz = ir->mask.x;
	 break;
      case 1:
	 swiz = ir->mask.y;
	 break;
      case 2:
	 swiz = ir->mask.z;
	 break;
      case 3:
	 swiz = ir->mask.w;
	 break;
      }

      channel.reg_offset += swiz;
      emit(fs_inst(BRW_OPCODE_MOV, result, channel));
      result.reg_offset++;
   }
}

void
fs_visitor::visit(ir_discard *ir)
{
   fs_reg temp = fs_reg(this, glsl_type::uint_type);

   assert(ir->condition == NULL); /* FINISHME */

   emit(fs_inst(FS_OPCODE_DISCARD_NOT, temp, reg_null_d));
   emit(fs_inst(FS_OPCODE_DISCARD_AND, reg_null_d, temp));
   kill_emitted = true;
}

void
fs_visitor::visit(ir_constant *ir)
{
   fs_reg reg(this, ir->type);
   this->result = reg;

   for (unsigned int i = 0; i < ir->type->vector_elements; i++) {
      switch (ir->type->base_type) {
      case GLSL_TYPE_FLOAT:
	 emit(fs_inst(BRW_OPCODE_MOV, reg, fs_reg(ir->value.f[i])));
	 break;
      case GLSL_TYPE_UINT:
	 emit(fs_inst(BRW_OPCODE_MOV, reg, fs_reg(ir->value.u[i])));
	 break;
      case GLSL_TYPE_INT:
	 emit(fs_inst(BRW_OPCODE_MOV, reg, fs_reg(ir->value.i[i])));
	 break;
      case GLSL_TYPE_BOOL:
	 emit(fs_inst(BRW_OPCODE_MOV, reg, fs_reg((int)ir->value.b[i])));
	 break;
      default:
	 assert(!"Non-float/uint/int/bool constant");
      }
      reg.reg_offset++;
   }
}

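/* Rather than materializing a boolean as a 0/1 value and then testing
 * it, fold the expression that produced it into a conditional-mod
 * instruction that writes the flag register directly; the following
 * predicated instructions then key off that flag.
 */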
void
fs_visitor::emit_bool_to_cond_code(ir_rvalue *ir)
{
   ir_expression *expr = ir->as_expression();

   if (expr) {
      fs_reg op[2];
      fs_inst *inst;

      for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
	 assert(expr->operands[i]->type->is_scalar());

	 expr->operands[i]->accept(this);
	 op[i] = this->result;
      }

      switch (expr->operation) {
      case ir_unop_logic_not:
	 inst = emit(fs_inst(BRW_OPCODE_AND, reg_null_d, op[0], fs_reg(1)));
	 inst->conditional_mod = BRW_CONDITIONAL_Z;
	 break;

      case ir_binop_logic_xor:
	 inst = emit(fs_inst(BRW_OPCODE_XOR, reg_null_d, op[0], op[1]));
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 break;

      case ir_binop_logic_or:
	 inst = emit(fs_inst(BRW_OPCODE_OR, reg_null_d, op[0], op[1]));
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 break;

      case ir_binop_logic_and:
	 inst = emit(fs_inst(BRW_OPCODE_AND, reg_null_d, op[0], op[1]));
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 break;

      case ir_unop_f2b:
	 if (intel->gen >= 6) {
	    inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null_d,
				op[0], fs_reg(0.0f)));
	 } else {
	    inst = emit(fs_inst(BRW_OPCODE_MOV, reg_null_d, op[0]));
	 }
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 break;

      case ir_unop_i2b:
	 if (intel->gen >= 6) {
	    inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null_d, op[0], fs_reg(0)));
	 } else {
	    inst = emit(fs_inst(BRW_OPCODE_MOV, reg_null_d, op[0]));
	 }
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 break;

      case ir_binop_greater:
	 inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null_d, op[0], op[1]));
	 inst->conditional_mod = BRW_CONDITIONAL_G;
	 break;
      case ir_binop_gequal:
	 inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null_d, op[0], op[1]));
	 inst->conditional_mod = BRW_CONDITIONAL_GE;
	 break;
      case ir_binop_less:
	 inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null_d, op[0], op[1]));
	 inst->conditional_mod = BRW_CONDITIONAL_L;
	 break;
      case ir_binop_lequal:
	 inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null_d, op[0], op[1]));
	 inst->conditional_mod = BRW_CONDITIONAL_LE;
	 break;
      case ir_binop_equal:
      case ir_binop_all_equal:
	 inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null_d, op[0], op[1]));
	 inst->conditional_mod = BRW_CONDITIONAL_Z;
	 break;
      case ir_binop_nequal:
      case ir_binop_any_nequal:
	 inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null_d, op[0], op[1]));
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 break;
      default:
	 assert(!"not reached");
	 this->fail = true;
	 break;
      }
      return;
   }

   ir->accept(this);

   if (intel->gen >= 6) {
      fs_inst *inst = emit(fs_inst(BRW_OPCODE_AND, reg_null_d,
				   this->result, fs_reg(1)));
      inst->conditional_mod = BRW_CONDITIONAL_NZ;
   } else {
      fs_inst *inst = emit(fs_inst(BRW_OPCODE_MOV, reg_null_d, this->result));
      inst->conditional_mod = BRW_CONDITIONAL_NZ;
   }
}

/**
 * Emit a gen6 IF statement with the comparison folded into the IF
 * instruction.
 */
void
fs_visitor::emit_if_gen6(ir_if *ir)
{
   ir_expression *expr = ir->condition->as_expression();

   if (expr) {
      fs_reg op[2];
      fs_inst *inst;
      fs_reg temp;

      for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
	 assert(expr->operands[i]->type->is_scalar());

	 expr->operands[i]->accept(this);
	 op[i] = this->result;
      }

      switch (expr->operation) {
      case ir_unop_logic_not:
	 inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_d, op[0], fs_reg(1)));
	 inst->conditional_mod = BRW_CONDITIONAL_Z;
	 return;

      case ir_binop_logic_xor:
	 inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_d, op[0], op[1]));
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 return;

      case ir_binop_logic_or:
	 temp = fs_reg(this, glsl_type::bool_type);
	 emit(fs_inst(BRW_OPCODE_OR, temp, op[0], op[1]));
	 inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_d, temp, fs_reg(0)));
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 return;

      case ir_binop_logic_and:
	 temp = fs_reg(this, glsl_type::bool_type);
	 emit(fs_inst(BRW_OPCODE_AND, temp, op[0], op[1]));
	 inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_d, temp, fs_reg(0)));
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 return;

      case ir_unop_f2b:
	 inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_f, op[0], fs_reg(0)));
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 return;

      case ir_unop_i2b:
	 inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_d, op[0], fs_reg(0)));
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 return;

      case ir_binop_greater:
	 inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_d, op[0], op[1]));
	 inst->conditional_mod = BRW_CONDITIONAL_G;
	 return;
      case ir_binop_gequal:
	 inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_d, op[0], op[1]));
	 inst->conditional_mod = BRW_CONDITIONAL_GE;
	 return;
      case ir_binop_less:
	 inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_d, op[0], op[1]));
	 inst->conditional_mod = BRW_CONDITIONAL_L;
	 return;
      case ir_binop_lequal:
	 inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_d, op[0], op[1]));
	 inst->conditional_mod = BRW_CONDITIONAL_LE;
	 return;
      case ir_binop_equal:
      case ir_binop_all_equal:
	 inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_d, op[0], op[1]));
	 inst->conditional_mod = BRW_CONDITIONAL_Z;
	 return;
      case ir_binop_nequal:
      case ir_binop_any_nequal:
	 inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_d, op[0], op[1]));
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 return;
      default:
	 assert(!"not reached");
	 inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_d, op[0], fs_reg(0)));
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 this->fail = true;
	 return;
      }
      return;
   }

   ir->condition->accept(this);

   fs_inst *inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_d, this->result, fs_reg(0)));
   inst->conditional_mod = BRW_CONDITIONAL_NZ;
}

void
fs_visitor::visit(ir_if *ir)
{
   fs_inst *inst;

   /* Don't point the annotation at the if statement, because then it plus
    * the then and else blocks get printed.
    */
   this->base_ir = ir->condition;

   if (intel->gen >= 6) {
      emit_if_gen6(ir);
   } else {
      emit_bool_to_cond_code(ir->condition);

      inst = emit(fs_inst(BRW_OPCODE_IF));
      inst->predicated = true;
   }

   foreach_iter(exec_list_iterator, iter, ir->then_instructions) {
      ir_instruction *ir = (ir_instruction *)iter.get();
      this->base_ir = ir;

      ir->accept(this);
   }

   if (!ir->else_instructions.is_empty()) {
      emit(fs_inst(BRW_OPCODE_ELSE));

      foreach_iter(exec_list_iterator, iter, ir->else_instructions) {
	 ir_instruction *ir = (ir_instruction *)iter.get();
	 this->base_ir = ir;

	 ir->accept(this);
      }
   }

   emit(fs_inst(BRW_OPCODE_ENDIF));
}

void
fs_visitor::visit(ir_loop *ir)
{
   fs_reg counter = reg_undef;

   if (ir->counter) {
      this->base_ir = ir->counter;
      ir->counter->accept(this);
      counter = *(variable_storage(ir->counter));

      if (ir->from) {
	 this->base_ir = ir->from;
	 ir->from->accept(this);

	 emit(fs_inst(BRW_OPCODE_MOV, counter, this->result));
      }
   }

   emit(fs_inst(BRW_OPCODE_DO));

   if (ir->to) {
      this->base_ir = ir->to;
      ir->to->accept(this);

      fs_inst *inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null_d,
				   counter, this->result));
      switch (ir->cmp) {
      case ir_binop_equal:
	 inst->conditional_mod = BRW_CONDITIONAL_Z;
	 break;
      case ir_binop_nequal:
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 break;
      case ir_binop_gequal:
	 inst->conditional_mod = BRW_CONDITIONAL_GE;
	 break;
      case ir_binop_lequal:
	 inst->conditional_mod = BRW_CONDITIONAL_LE;
	 break;
      case ir_binop_greater:
	 inst->conditional_mod = BRW_CONDITIONAL_G;
	 break;
      case ir_binop_less:
	 inst->conditional_mod = BRW_CONDITIONAL_L;
	 break;
      default:
	 assert(!"not reached: unknown loop condition");
	 this->fail = true;
	 break;
      }

      inst = emit(fs_inst(BRW_OPCODE_BREAK));
      inst->predicated = true;
   }

   foreach_iter(exec_list_iterator, iter, ir->body_instructions) {
      ir_instruction *ir = (ir_instruction *)iter.get();

      this->base_ir = ir;
      ir->accept(this);
   }

   if (ir->increment) {
      this->base_ir = ir->increment;
      ir->increment->accept(this);
      emit(fs_inst(BRW_OPCODE_ADD, counter, counter, this->result));
   }

   emit(fs_inst(BRW_OPCODE_WHILE));
}

void
fs_visitor::visit(ir_loop_jump *ir)
{
   switch (ir->mode) {
   case ir_loop_jump::jump_break:
      emit(fs_inst(BRW_OPCODE_BREAK));
      break;
   case ir_loop_jump::jump_continue:
      emit(fs_inst(BRW_OPCODE_CONTINUE));
      break;
   }
}

void
fs_visitor::visit(ir_call *ir)
{
   assert(!"FINISHME");
}

void
fs_visitor::visit(ir_return *ir)
{
   assert(!"FINISHME");
}

void
fs_visitor::visit(ir_function *ir)
{
   /* Ignore function bodies other than main() -- we shouldn't see calls to
    * them since they should all be inlined before we get to ir_to_mesa.
    */
   if (strcmp(ir->name, "main") == 0) {
      const ir_function_signature *sig;
      exec_list empty;

      sig = ir->matching_signature(&empty);

      assert(sig);

      foreach_iter(exec_list_iterator, iter, sig->body) {
	 ir_instruction *ir = (ir_instruction *)iter.get();
	 this->base_ir = ir;

	 ir->accept(this);
      }
   }
}

void
fs_visitor::visit(ir_function_signature *ir)
{
   assert(!"not reached");
   (void)ir;
}

fs_inst *
fs_visitor::emit(fs_inst inst)
{
   fs_inst *list_inst = new(mem_ctx) fs_inst;
   *list_inst = inst;

   list_inst->annotation = this->current_annotation;
   list_inst->ir = this->base_ir;

   this->instructions.push_tail(list_inst);

   return list_inst;
}

/** Emits a dummy fragment shader consisting of magenta for bringup purposes. */
void
fs_visitor::emit_dummy_fs()
{
   /* Everyone's favorite color. */
   emit(fs_inst(BRW_OPCODE_MOV,
		fs_reg(MRF, 2),
		fs_reg(1.0f)));
   emit(fs_inst(BRW_OPCODE_MOV,
		fs_reg(MRF, 3),
		fs_reg(0.0f)));
   emit(fs_inst(BRW_OPCODE_MOV,
		fs_reg(MRF, 4),
		fs_reg(1.0f)));
   emit(fs_inst(BRW_OPCODE_MOV,
		fs_reg(MRF, 5),
		fs_reg(0.0f)));

   fs_inst *write;
   write = emit(fs_inst(FS_OPCODE_FB_WRITE,
			fs_reg(0),
			fs_reg(0)));
   write->base_mrf = 0;
}

/* The register location here is relative to the start of the URB
 * data.  It will get adjusted to be a real location before
 * generate_code() time.
 */
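/* Each attribute slot occupies two registers holding the plane
 * coefficients for its four channels: channels 0-1 live in the first
 * register and 2-3 in the second, each channel taking a group of four
 * floats.  Hence regnr = slot * 2 + channel / 2, and the "stride" local
 * below is really a four-float subregister offset for odd channels.
 */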
struct brw_reg
fs_visitor::interp_reg(int location, int channel)
{
   int regnr = urb_setup[location] * 2 + channel / 2;
   int stride = (channel & 1) * 4;

   assert(urb_setup[location] != -1);

   return brw_vec1_grf(regnr, stride);
}

/** Emits the interpolation for the varying inputs. */
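/* The payload's g1 holds the screen X/Y of each 2x2 subspan's origin.
 * brw_imm_v supplies a per-channel vector immediate: 0x10101010 should
 * decode as the per-pixel X offsets 0,1,0,1,... and 0x11001100 as the
 * Y offsets 0,0,1,1,..., fanning the subspan origins out into
 * individual pixel centers.
 */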
void
fs_visitor::emit_interpolation_setup_gen4()
{
   struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);

   this->current_annotation = "compute pixel centers";
   this->pixel_x = fs_reg(this, glsl_type::uint_type);
   this->pixel_y = fs_reg(this, glsl_type::uint_type);
   this->pixel_x.type = BRW_REGISTER_TYPE_UW;
   this->pixel_y.type = BRW_REGISTER_TYPE_UW;
   emit(fs_inst(BRW_OPCODE_ADD,
		this->pixel_x,
		fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
		fs_reg(brw_imm_v(0x10101010))));
   emit(fs_inst(BRW_OPCODE_ADD,
		this->pixel_y,
		fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
		fs_reg(brw_imm_v(0x11001100))));

   this->current_annotation = "compute pixel deltas from v0";
   if (brw->has_pln) {
      this->delta_x = fs_reg(this, glsl_type::vec2_type);
      this->delta_y = this->delta_x;
      this->delta_y.reg_offset++;
   } else {
      this->delta_x = fs_reg(this, glsl_type::float_type);
      this->delta_y = fs_reg(this, glsl_type::float_type);
   }
   emit(fs_inst(BRW_OPCODE_ADD,
		this->delta_x,
		this->pixel_x,
		fs_reg(negate(brw_vec1_grf(1, 0)))));
   emit(fs_inst(BRW_OPCODE_ADD,
		this->delta_y,
		this->pixel_y,
		fs_reg(negate(brw_vec1_grf(1, 1)))));

   this->current_annotation = "compute pos.w and 1/pos.w";
   /* Compute wpos.w.  It's always in our setup, since it's needed to
    * interpolate the other attributes.
    */
   this->wpos_w = fs_reg(this, glsl_type::float_type);
   emit(fs_inst(FS_OPCODE_LINTERP, wpos_w, this->delta_x, this->delta_y,
		interp_reg(FRAG_ATTRIB_WPOS, 3)));
   /* Compute the pixel 1/W value from wpos.w. */
   this->pixel_w = fs_reg(this, glsl_type::float_type);
   emit_math(FS_OPCODE_RCP, this->pixel_w, wpos_w);
   this->current_annotation = NULL;
}

/** Emits the interpolation for the varying inputs. */
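/* On gen6 the payload already delivers the delta_x/delta_y
 * interpolation inputs in g2/g3 and the source W in a register chosen
 * by source_w_reg, so only the pixel centers (plus a float copy of
 * them, since gen6 can't mix int and float sources) are computed here.
 */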
void
fs_visitor::emit_interpolation_setup_gen6()
{
   struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);

   /* If the pixel centers end up used, the setup is the same as for gen4. */
   this->current_annotation = "compute pixel centers";
   fs_reg int_pixel_x = fs_reg(this, glsl_type::uint_type);
   fs_reg int_pixel_y = fs_reg(this, glsl_type::uint_type);
   int_pixel_x.type = BRW_REGISTER_TYPE_UW;
   int_pixel_y.type = BRW_REGISTER_TYPE_UW;
   emit(fs_inst(BRW_OPCODE_ADD,
		int_pixel_x,
		fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
		fs_reg(brw_imm_v(0x10101010))));
   emit(fs_inst(BRW_OPCODE_ADD,
		int_pixel_y,
		fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
		fs_reg(brw_imm_v(0x11001100))));

   /* As of gen6, we can no longer mix float and int sources.  We have
    * to turn the integer pixel centers into floats for their actual
    * use.
    */
   this->pixel_x = fs_reg(this, glsl_type::float_type);
   this->pixel_y = fs_reg(this, glsl_type::float_type);
   emit(fs_inst(BRW_OPCODE_MOV, this->pixel_x, int_pixel_x));
   emit(fs_inst(BRW_OPCODE_MOV, this->pixel_y, int_pixel_y));

   this->current_annotation = "compute 1/pos.w";
   this->wpos_w = fs_reg(brw_vec8_grf(c->key.source_w_reg, 0));
   this->pixel_w = fs_reg(this, glsl_type::float_type);
   emit_math(FS_OPCODE_RCP, this->pixel_w, wpos_w);

   this->delta_x = fs_reg(brw_vec8_grf(2, 0));
   this->delta_y = fs_reg(brw_vec8_grf(3, 0));

   this->current_annotation = NULL;
}

void
fs_visitor::emit_fb_writes()
{
   this->current_annotation = "FB write header";
1860   bool header_present = true;
1861   int nr = 0;
1862
1863   if (intel->gen >= 6 &&
1864       !this->kill_emitted &&
1865       c->key.nr_color_regions == 1) {
1866      header_present = false;
1867   }
1868
1869   if (header_present) {
1870      /* m0, m1 header */
1871      nr += 2;
1872   }
1873
1874   if (c->key.aa_dest_stencil_reg) {
1875      emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
1876		   fs_reg(brw_vec8_grf(c->key.aa_dest_stencil_reg, 0))));
1877   }
1878
1879   /* Reserve space for color. It'll be filled in per MRT below. */
1880   int color_mrf = nr;
1881   nr += 4;
1882
1883   if (c->key.source_depth_to_render_target) {
1884      if (c->key.computes_depth) {
1885	 /* Hand over gl_FragDepth. */
1886	 assert(this->frag_depth);
1887	 fs_reg depth = *(variable_storage(this->frag_depth));
1888
1889	 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, nr++), depth));
1890      } else {
1891	 /* Pass through the payload depth. */
1892	 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
1893		      fs_reg(brw_vec8_grf(c->key.source_depth_reg, 0))));
1894      }
1895   }
1896
1897   if (c->key.dest_depth_reg) {
1898      emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
1899		   fs_reg(brw_vec8_grf(c->key.dest_depth_reg, 0))));
1900   }
1901
1902   fs_reg color = reg_undef;
1903   if (this->frag_color)
1904      color = *(variable_storage(this->frag_color));
1905   else if (this->frag_data)
1906      color = *(variable_storage(this->frag_data));
1907
1908   for (int target = 0; target < c->key.nr_color_regions; target++) {
1909      this->current_annotation = talloc_asprintf(this->mem_ctx,
1910						 "FB write target %d",
1911						 target);
1912      if (this->frag_color || this->frag_data) {
1913	 for (int i = 0; i < 4; i++) {
1914	    emit(fs_inst(BRW_OPCODE_MOV,
1915			 fs_reg(MRF, color_mrf + i),
1916			 color));
1917	    color.reg_offset++;
1918	 }
1919      }
1920
1921      if (this->frag_color)
1922	 color.reg_offset -= 4;
1923
1924      fs_inst *inst = emit(fs_inst(FS_OPCODE_FB_WRITE,
1925				   reg_undef, reg_undef));
1926      inst->target = target;
1927      inst->base_mrf = 0;
1928      inst->mlen = nr;
1929      if (target == c->key.nr_color_regions - 1)
1930	 inst->eot = true;
1931      inst->header_present = header_present;
1932   }
1933
1934   if (c->key.nr_color_regions == 0) {
1935      fs_inst *inst = emit(fs_inst(FS_OPCODE_FB_WRITE,
1936				   reg_undef, reg_undef));
1937      inst->base_mrf = 0;
1938      inst->mlen = nr;
1939      inst->eot = true;
1940      inst->header_present = header_present;
1941   }
1942
1943   this->current_annotation = NULL;
1944}
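
/* The resulting message layout, under assumed key bits (our summary, not
 * driver code): with a header and no stencil or depth payload, m0-m1 hold
 * the header and m2-m5 hold one RGBA color, so each FB write above goes
 * out with mlen == 6.
 */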
1945
1946void
1947fs_visitor::generate_fb_write(fs_inst *inst)
1948{
1949   GLboolean eot = inst->eot;
1950   struct brw_reg implied_header;
1951
1952   /* The header is 2 regs, and g0 and g1 are the contents.  g0 is covered
1953    * by the implied move; here we set up g1.
1954    */
1955   brw_push_insn_state(p);
1956   brw_set_mask_control(p, BRW_MASK_DISABLE);
1957   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1958
1959   if (inst->header_present) {
1960      if (intel->gen >= 6) {
1961	 brw_MOV(p,
1962		 brw_message_reg(inst->base_mrf),
1963		 brw_vec8_grf(0, 0));
1964
1965	 if (inst->target > 0) {
1966	    /* Set the render target index for choosing BLEND_STATE. */
1967	    brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, 0, 2),
1968			      BRW_REGISTER_TYPE_UD),
1969		    brw_imm_ud(inst->target));
1970	 }
1971
1972	 /* Clear viewport index, render target array index. */
1973	 brw_AND(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, 0, 0),
1974			   BRW_REGISTER_TYPE_UD),
1975		 retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD),
1976		 brw_imm_ud(0xf7ff));
1977
1978	 implied_header = brw_null_reg();
1979      } else {
1980	 implied_header = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
1981      }
1982
1983      brw_MOV(p,
1984	      brw_message_reg(inst->base_mrf + 1),
1985	      brw_vec8_grf(1, 0));
1986   } else {
1987      implied_header = brw_null_reg();
1988   }
1989
1990   brw_pop_insn_state(p);
1991
1992   brw_fb_WRITE(p,
1993		8, /* dispatch_width */
1994		retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW),
1995		inst->base_mrf,
1996		implied_header,
1997		inst->target,
1998		inst->mlen,
1999		0,
2000		eot);
2001}
2002
2003void
2004fs_visitor::generate_linterp(fs_inst *inst,
2005			     struct brw_reg dst, struct brw_reg *src)
2006{
2007   struct brw_reg delta_x = src[0];
2008   struct brw_reg delta_y = src[1];
2009   struct brw_reg interp = src[2];
2010
2011   if (brw->has_pln &&
2012       delta_y.nr == delta_x.nr + 1 &&
2013       (intel->gen >= 6 || (delta_x.nr & 1) == 0)) {
2014      brw_PLN(p, dst, interp, delta_x);
2015   } else {
2016      brw_LINE(p, brw_null_reg(), interp, delta_x);
2017      brw_MAC(p, dst, suboffset(interp, 1), delta_y);
2018   }
2019}
2020
2021void
2022fs_visitor::generate_math(fs_inst *inst,
2023			  struct brw_reg dst, struct brw_reg *src)
2024{
2025   int op;
2026
2027   switch (inst->opcode) {
2028   case FS_OPCODE_RCP:
2029      op = BRW_MATH_FUNCTION_INV;
2030      break;
2031   case FS_OPCODE_RSQ:
2032      op = BRW_MATH_FUNCTION_RSQ;
2033      break;
2034   case FS_OPCODE_SQRT:
2035      op = BRW_MATH_FUNCTION_SQRT;
2036      break;
2037   case FS_OPCODE_EXP2:
2038      op = BRW_MATH_FUNCTION_EXP;
2039      break;
2040   case FS_OPCODE_LOG2:
2041      op = BRW_MATH_FUNCTION_LOG;
2042      break;
2043   case FS_OPCODE_POW:
2044      op = BRW_MATH_FUNCTION_POW;
2045      break;
2046   case FS_OPCODE_SIN:
2047      op = BRW_MATH_FUNCTION_SIN;
2048      break;
2049   case FS_OPCODE_COS:
2050      op = BRW_MATH_FUNCTION_COS;
2051      break;
2052   default:
2053      assert(!"not reached: unknown math function");
2054      op = 0;
2055      break;
2056   }
2057
2058   if (intel->gen >= 6) {
2059      assert(inst->mlen == 0);
2060
2061      if (inst->opcode == FS_OPCODE_POW) {
2062	 brw_math2(p, dst, op, src[0], src[1]);
2063      } else {
2064	 brw_math(p, dst,
2065		  op,
2066		  inst->saturate ? BRW_MATH_SATURATE_SATURATE :
2067		  BRW_MATH_SATURATE_NONE,
2068		  0, src[0],
2069		  BRW_MATH_DATA_VECTOR,
2070		  BRW_MATH_PRECISION_FULL);
2071      }
2072   } else {
2073      assert(inst->mlen >= 1);
2074
2075      brw_math(p, dst,
2076	       op,
2077	       inst->saturate ? BRW_MATH_SATURATE_SATURATE :
2078	       BRW_MATH_SATURATE_NONE,
2079	       inst->base_mrf, src[0],
2080	       BRW_MATH_DATA_VECTOR,
2081	       BRW_MATH_PRECISION_FULL);
2082   }
2083}
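
/* For context on the mlen asserts above (our summary, not driver code):
 * before gen6 the math functions live in a shared unit reached by a
 * send-like message, so the operand has to be staged in MRFs first
 * (hence inst->base_mrf and mlen >= 1), while from gen6 on math is an
 * ordinary ALU instruction reading its sources straight from the GRF
 * (mlen == 0).
 */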
2084
2085void
2086fs_visitor::generate_tex(fs_inst *inst, struct brw_reg dst)
2087{
2088   int msg_type = -1;
2089   int rlen = 4;
2090   uint32_t simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
2091
2092   if (intel->gen >= 5) {
2093      switch (inst->opcode) {
2094      case FS_OPCODE_TEX:
2095	 if (inst->shadow_compare) {
2096	    msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_COMPARE_GEN5;
2097	 } else {
2098	    msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_GEN5;
2099	 }
2100	 break;
2101      case FS_OPCODE_TXB:
2102	 if (inst->shadow_compare) {
2103	    msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE_GEN5;
2104	 } else {
2105	    msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_BIAS_GEN5;
2106	 }
2107	 break;
2108      }
2109   } else {
2110      switch (inst->opcode) {
2111      case FS_OPCODE_TEX:
2112	 /* Note that G45 and older determine shadow compare and dispatch width
2113	  * from message length for most messages.
2114	  */
2115	 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
2116	 if (inst->shadow_compare) {
2117	    assert(inst->mlen == 6);
2118	 } else {
2119	    assert(inst->mlen <= 4);
2120	 }
2121	 break;
2122      case FS_OPCODE_TXB:
2123	 if (inst->shadow_compare) {
2124	    assert(inst->mlen == 6);
2125	    msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
2126	 } else {
2127	    assert(inst->mlen == 9);
2128	    msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
2129	    simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
2130	 }
2131	 break;
2132      }
2133   }
2134   assert(msg_type != -1);
2135
2136   if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) {
2137      rlen = 8;
2138      dst = vec16(dst);
2139   }
2140
2141   brw_SAMPLE(p,
2142	      retype(dst, BRW_REGISTER_TYPE_UW),
2143	      inst->base_mrf,
2144	      retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
2145              SURF_INDEX_TEXTURE(inst->sampler),
2146	      inst->sampler,
2147	      WRITEMASK_XYZW,
2148	      msg_type,
2149	      rlen,
2150	      inst->mlen,
2151	      0,
2152	      1,
2153	      simd_mode);
2154}
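
/* On the rlen choice above (our reading, not driver code): a sampler
 * response carries 4 channels.  In SIMD8 each channel fits in one reg,
 * giving rlen == 4; the SIMD16 bias message doubles that to two regs per
 * channel, hence rlen == 8 and the vec16() destination.
 */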
2155
2156
2157/* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
2158 * looking like:
2159 *
2160 * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
2161 *
2162 * and we're trying to produce:
2163 *
2164 *           DDX                     DDY
2165 * dst: (ss0.tr - ss0.tl)     (ss0.tl - ss0.bl)
2166 *      (ss0.tr - ss0.tl)     (ss0.tr - ss0.br)
2167 *      (ss0.br - ss0.bl)     (ss0.tl - ss0.bl)
2168 *      (ss0.br - ss0.bl)     (ss0.tr - ss0.br)
2169 *      (ss1.tr - ss1.tl)     (ss1.tl - ss1.bl)
2170 *      (ss1.tr - ss1.tl)     (ss1.tr - ss1.br)
2171 *      (ss1.br - ss1.bl)     (ss1.tl - ss1.bl)
2172 *      (ss1.br - ss1.bl)     (ss1.tr - ss1.br)
2173 *
2174 * and add another set of two more subspans if in 16-pixel dispatch mode.
2175 *
2176 * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
2177 * for each pair, and vertstride = 2 jumps us 2 elements after processing a
2178 * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
2179 * between each other.  We could probably do it like ddx and swizzle the right
2180 * order later, but bail for now and just produce
2181 * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
2182 */
2183void
2184fs_visitor::generate_ddx(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
2185{
2186   struct brw_reg src0 = brw_reg(src.file, src.nr, 1,
2187				 BRW_REGISTER_TYPE_F,
2188				 BRW_VERTICAL_STRIDE_2,
2189				 BRW_WIDTH_2,
2190				 BRW_HORIZONTAL_STRIDE_0,
2191				 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
2192   struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
2193				 BRW_REGISTER_TYPE_F,
2194				 BRW_VERTICAL_STRIDE_2,
2195				 BRW_WIDTH_2,
2196				 BRW_HORIZONTAL_STRIDE_0,
2197				 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
2198   brw_ADD(p, dst, src0, negate(src1));
2199}
2200
2201void
2202fs_visitor::generate_ddy(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
2203{
2204   struct brw_reg src0 = brw_reg(src.file, src.nr, 0,
2205				 BRW_REGISTER_TYPE_F,
2206				 BRW_VERTICAL_STRIDE_4,
2207				 BRW_WIDTH_4,
2208				 BRW_HORIZONTAL_STRIDE_0,
2209				 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
2210   struct brw_reg src1 = brw_reg(src.file, src.nr, 2,
2211				 BRW_REGISTER_TYPE_F,
2212				 BRW_VERTICAL_STRIDE_4,
2213				 BRW_WIDTH_4,
2214				 BRW_HORIZONTAL_STRIDE_0,
2215				 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
2216   brw_ADD(p, dst, src0, negate(src1));
2217}
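
/* A worked example of the regioning above (our reading of the
 * <vstride;width,hstride> rules, not driver code): for generate_ddx,
 * src0 is <2;2,0> starting at element 1 and src1 is <2;2,0> at element
 * 0.  Over 8 channels each region yields 4 rows of 2 identical
 * elements, stepping 2 elements per row:
 *
 *    src0 reads elements 1,1,3,3,5,5,7,7   (tr, tr, br, br per subspan)
 *    src1 reads elements 0,0,2,2,4,4,6,6   (tl, tl, bl, bl per subspan)
 *
 * so the ADD produces (tr - tl, tr - tl, br - bl, br - bl) per subspan,
 * matching the DDX column in the table.  generate_ddy's <4;4,0> regions
 * likewise read tl x4 and bl x4, giving the (tl - bl)x4 result noted
 * above.
 */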
2218
2219void
2220fs_visitor::generate_discard_not(fs_inst *inst, struct brw_reg mask)
2221{
2222   if (intel->gen >= 6) {
2223      /* Gen6 no longer has the mask reg for us to just read the
2224       * active channels from.  However, cmp updates just the channels
2225       * of the flag reg that are enabled, so we can get at the
2226       * channel enables that way.  In this step, make a reg of ones
2227       * we'll compare to.
2228       */
2229      brw_MOV(p, mask, brw_imm_ud(1));
2230   } else {
2231      brw_push_insn_state(p);
2232      brw_set_mask_control(p, BRW_MASK_DISABLE);
2233      brw_NOT(p, mask, brw_mask_reg(1)); /* IMASK */
2234      brw_pop_insn_state(p);
2235   }
2236}
2237
2238void
2239fs_visitor::generate_discard_and(fs_inst *inst, struct brw_reg mask)
2240{
2241   if (intel->gen >= 6) {
2242      struct brw_reg f0 = brw_flag_reg();
2243      struct brw_reg g1 = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
2244
2245      brw_push_insn_state(p);
2246      brw_set_mask_control(p, BRW_MASK_DISABLE);
2247      brw_MOV(p, f0, brw_imm_uw(0xffff)); /* inactive channels undiscarded */
2248      brw_pop_insn_state(p);
2249
2250      brw_CMP(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UD),
2251	      BRW_CONDITIONAL_Z, mask, brw_imm_ud(0)); /* active channels fail test */
2252      /* Undo CMP's whacking of predication. */
2253      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2254
2255      brw_push_insn_state(p);
2256      brw_set_mask_control(p, BRW_MASK_DISABLE);
2257      brw_AND(p, g1, f0, g1);
2258      brw_pop_insn_state(p);
2259   } else {
2260      struct brw_reg g0 = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
2261
2262      mask = brw_uw1_reg(mask.file, mask.nr, 0);
2263
2264      brw_push_insn_state(p);
2265      brw_set_mask_control(p, BRW_MASK_DISABLE);
2266      brw_AND(p, g0, mask, g0);
2267      brw_pop_insn_state(p);
2268   }
2269}
2270
2271void
2272fs_visitor::generate_spill(fs_inst *inst, struct brw_reg src)
2273{
2274   assert(inst->mlen != 0);
2275
2276   brw_MOV(p,
2277	   retype(brw_message_reg(inst->base_mrf + 1), BRW_REGISTER_TYPE_UD),
2278	   retype(src, BRW_REGISTER_TYPE_UD));
2279   brw_oword_block_write_scratch(p, brw_message_reg(inst->base_mrf), 1,
2280				 inst->offset);
2281}
2282
2283void
2284fs_visitor::generate_unspill(fs_inst *inst, struct brw_reg dst)
2285{
2286   assert(inst->mlen != 0);
2287
2288   /* Clear any post destination dependencies that would be ignored by
2289    * the block read.  See the B-Spec for pre-gen5 send instruction.
2290    *
2291    * This could use a better solution, since texture sampling and
2292    * math reads could potentially run into it as well -- anywhere
2293    * that we have a SEND with a destination that is a register that
2294    * was written but not read within the last N instructions (what's
2295    * N?  unsure).  This is rare because of dead code elimination, but
2296    * not impossible.
2297    */
2298   if (intel->gen == 4 && !intel->is_g4x)
2299      brw_MOV(p, brw_null_reg(), dst);
2300
2301   brw_oword_block_read_scratch(p, dst, brw_message_reg(inst->base_mrf), 1,
2302				inst->offset);
2303
2304   if (intel->gen == 4 && !intel->is_g4x) {
2305      /* gen4 errata: destination from a send can't be used as a
2306       * destination until it's been read.  Just read it so we don't
2307       * have to worry.
2308       */
2309      brw_MOV(p, brw_null_reg(), dst);
2310   }
2311}
2312
2313
2314void
2315fs_visitor::generate_pull_constant_load(fs_inst *inst, struct brw_reg dst)
2316{
2317   assert(inst->mlen != 0);
2318
2319   /* Clear any post destination dependencies that would be ignored by
2320    * the block read.  See the B-Spec for pre-gen5 send instruction.
2321    *
2322    * This could use a better solution, since texture sampling and
2323    * math reads could potentially run into it as well -- anywhere
2324    * that we have a SEND with a destination that is a register that
2325    * was written but not read within the last N instructions (what's
2326    * N?  unsure).  This is rare because of dead code elimination, but
2327    * not impossible.
2328    */
2329   if (intel->gen == 4 && !intel->is_g4x)
2330      brw_MOV(p, brw_null_reg(), dst);
2331
2332   brw_oword_block_read(p, dst, brw_message_reg(inst->base_mrf),
2333			inst->offset, SURF_INDEX_FRAG_CONST_BUFFER);
2334
2335   if (intel->gen == 4 && !intel->is_g4x) {
2336      /* gen4 errata: destination from a send can't be used as a
2337       * destination until it's been read.  Just read it so we don't
2338       * have to worry.
2339       */
2340      brw_MOV(p, brw_null_reg(), dst);
2341   }
2342}
2343
2344void
2345fs_visitor::assign_curb_setup()
2346{
2347   c->prog_data.first_curbe_grf = c->key.nr_payload_regs;
2348   c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
2349
2350   /* Map the offsets in the UNIFORM file to fixed HW regs. */
2351   foreach_iter(exec_list_iterator, iter, this->instructions) {
2352      fs_inst *inst = (fs_inst *)iter.get();
2353
2354      for (unsigned int i = 0; i < 3; i++) {
2355	 if (inst->src[i].file == UNIFORM) {
2356	    int constant_nr = inst->src[i].hw_reg + inst->src[i].reg_offset;
2357	    struct brw_reg brw_reg = brw_vec1_grf(c->prog_data.first_curbe_grf +
2358						  constant_nr / 8,
2359						  constant_nr % 8);
2360
2361	    inst->src[i].file = FIXED_HW_REG;
2362	    inst->src[i].fixed_hw_reg = brw_reg;
2363	 }
2364      }
2365   }
2366}
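
/* A quick worked example of the mapping above (assumed numbers, not
 * driver code): with nr_payload_regs == 2 and a uniform at constant_nr
 * == 11, the push constant lands at g(2 + 11 / 8) subreg (11 % 8), i.e.
 * the fourth float of g3.
 */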
2367
2368void
2369fs_visitor::calculate_urb_setup()
2370{
2371   for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
2372      urb_setup[i] = -1;
2373   }
2374
2375   int urb_next = 0;
2376   /* Figure out where each of the incoming setup attributes lands. */
2377   if (intel->gen >= 6) {
2378      for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
2379	 if (brw->fragment_program->Base.InputsRead & BITFIELD64_BIT(i)) {
2380	    urb_setup[i] = urb_next++;
2381	 }
2382      }
2383   } else {
2384      /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
2385      for (unsigned int i = 0; i < VERT_RESULT_MAX; i++) {
2386	 if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
2387	    int fp_index;
2388
2389	    if (i >= VERT_RESULT_VAR0)
2390	       fp_index = i - (VERT_RESULT_VAR0 - FRAG_ATTRIB_VAR0);
2391	    else if (i <= VERT_RESULT_TEX7)
2392	       fp_index = i;
2393	    else
2394	       fp_index = -1;
2395
2396	    if (fp_index >= 0)
2397	       urb_setup[fp_index] = urb_next++;
2398	 }
2399      }
2400   }
2401
2402   /* Each attribute is 4 setup channels, each of which is half a reg. */
2403   c->prog_data.urb_read_length = urb_next * 2;
2404}
2405
2406void
2407fs_visitor::assign_urb_setup()
2408{
2409   int urb_start = c->prog_data.first_curbe_grf + c->prog_data.curb_read_length;
2410
2411   /* Offset all the urb_setup[] index by the actual position of the
2412    * setup regs, now that the location of the constants has been chosen.
2413    */
2414   foreach_iter(exec_list_iterator, iter, this->instructions) {
2415      fs_inst *inst = (fs_inst *)iter.get();
2416
2417      if (inst->opcode != FS_OPCODE_LINTERP)
2418	 continue;
2419
2420      assert(inst->src[2].file == FIXED_HW_REG);
2421
2422      inst->src[2].fixed_hw_reg.nr += urb_start;
2423   }
2424
2425   this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
2426}
2427
2428/**
2429 * Split large virtual GRFs into separate components if we can.
2430 *
2431 * This mostly duplicates what brw_fs_vector_splitting does,
2432 * but that's really conservative because it's afraid of doing
2433 * splitting that doesn't result in real progress after the rest of
2434 * the optimization phases, which would cause infinite looping in
2435 * optimization.  We can do it once here, safely.  This also has the
2436 * opportunity to split interpolated values, or maybe even uniforms,
2437 * which we don't have at the IR level.
2438 *
2439 * We want to split, because virtual GRFs are what we register
2440 * allocate and spill (due to contiguousness requirements for some
2441 * instructions), and they're what we naturally generate in the
2442 * codegen process, but most virtual GRFs don't actually need to be
2443 * contiguous sets of GRFs.  If we split, we'll end up with reduced
2444 * live intervals and better dead code elimination and coalescing.
2445 */
2446void
2447fs_visitor::split_virtual_grfs()
2448{
2449   int num_vars = this->virtual_grf_next;
2450   bool split_grf[num_vars];
2451   int new_virtual_grf[num_vars];
2452
2453   /* Try to split anything bigger than one reg. */
2454   for (int i = 0; i < num_vars; i++) {
2455      if (this->virtual_grf_sizes[i] != 1)
2456	 split_grf[i] = true;
2457      else
2458	 split_grf[i] = false;
2459   }
2460
2461   if (brw->has_pln) {
2462      /* PLN opcodes rely on the delta_xy being contiguous. */
2463      split_grf[this->delta_x.reg] = false;
2464   }
2465
2466   foreach_iter(exec_list_iterator, iter, this->instructions) {
2467      fs_inst *inst = (fs_inst *)iter.get();
2468
2469      /* Texturing produces 4 contiguous registers, so no splitting. */
2470      if ((inst->opcode == FS_OPCODE_TEX ||
2471	   inst->opcode == FS_OPCODE_TXB ||
2472	   inst->opcode == FS_OPCODE_TXL) &&
2473	  inst->dst.file == GRF) {
2474	 split_grf[inst->dst.reg] = false;
2475      }
2476   }
2477
2478   /* Allocate new space for split regs.  Note that the virtual
2479    * numbers will be contiguous.
2480    */
2481   for (int i = 0; i < num_vars; i++) {
2482      if (split_grf[i]) {
2483	 new_virtual_grf[i] = virtual_grf_alloc(1);
2484	 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
2485	    int reg = virtual_grf_alloc(1);
2486	    assert(reg == new_virtual_grf[i] + j - 1);
2487	 }
2488	 this->virtual_grf_sizes[i] = 1;
2489      }
2490   }
2491
2492   foreach_iter(exec_list_iterator, iter, this->instructions) {
2493      fs_inst *inst = (fs_inst *)iter.get();
2494
2495      if (inst->dst.file == GRF &&
2496	  split_grf[inst->dst.reg] &&
2497	  inst->dst.reg_offset != 0) {
2498	 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
2499			  inst->dst.reg_offset - 1);
2500	 inst->dst.reg_offset = 0;
2501      }
2502      for (int i = 0; i < 3; i++) {
2503	 if (inst->src[i].file == GRF &&
2504	     split_grf[inst->src[i].reg] &&
2505	     inst->src[i].reg_offset != 0) {
2506	    inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
2507				inst->src[i].reg_offset - 1);
2508	    inst->src[i].reg_offset = 0;
2509	 }
2510      }
2511   }
2512}
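
/* A sketch of the bookkeeping above on an assumed example (not driver
 * code): a size-4 virtual GRF vgrf5 keeps reg_offset 0 in vgrf5, while
 * virtual_grf_alloc(1) hands out, say, vgrf17..vgrf19 for offsets 1..3.
 * A reference to (vgrf5, reg_offset 2) is then rewritten to
 * vgrf17 + 2 - 1 == vgrf18 with reg_offset 0, which is exactly the
 * new_virtual_grf[] arithmetic in the last loop.
 */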
2513
2514/**
2515 * Choose accesses from the UNIFORM file to demote to using the pull
2516 * constant buffer.
2517 *
2518 * We allow a fragment shader to use more than the GL-specified minimum
2519 * maximum number of fragment shader uniform components (64).  If there
2520 * are too many of these, they'd fill up all of the register space.
2521 * So, this will push some of them out to the pull constant buffer and
2522 * update the program to load them.
2523 */
2524void
2525fs_visitor::setup_pull_constants()
2526{
2527   /* Only allow 16 registers (128 uniform components) as push constants. */
2528   unsigned int max_uniform_components = 16 * 8;
2529   if (c->prog_data.nr_params <= max_uniform_components)
2530      return;
2531
2532   /* Just demote the end of the list.  We could probably do better
2533    * here, demoting things that are rarely used in the program first.
2534    */
2535   int pull_uniform_base = max_uniform_components;
2536   int pull_uniform_count = c->prog_data.nr_params - pull_uniform_base;
2537
2538   foreach_iter(exec_list_iterator, iter, this->instructions) {
2539      fs_inst *inst = (fs_inst *)iter.get();
2540
2541      for (int i = 0; i < 3; i++) {
2542	 if (inst->src[i].file != UNIFORM)
2543	    continue;
2544
2545	 int uniform_nr = inst->src[i].hw_reg + inst->src[i].reg_offset;
2546	 if (uniform_nr < pull_uniform_base)
2547	    continue;
2548
2549	 fs_reg dst = fs_reg(this, glsl_type::float_type);
2550	 fs_inst *pull = new(mem_ctx) fs_inst(FS_OPCODE_PULL_CONSTANT_LOAD,
2551					      dst);
2552	 pull->offset = ((uniform_nr - pull_uniform_base) * 4) & ~15;
2553	 pull->ir = inst->ir;
2554	 pull->annotation = inst->annotation;
2555	 pull->base_mrf = 14;
2556	 pull->mlen = 1;
2557
2558	 inst->insert_before(pull);
2559
2560	 inst->src[i].file = GRF;
2561	 inst->src[i].reg = dst.reg;
2562	 inst->src[i].reg_offset = 0;
2563	 inst->src[i].smear = (uniform_nr - pull_uniform_base) & 3;
2564      }
2565   }
2566
2567   for (int i = 0; i < pull_uniform_count; i++) {
2568      c->prog_data.pull_param[i] = c->prog_data.param[pull_uniform_base + i];
2569   }
2570   c->prog_data.nr_params -= pull_uniform_count;
2571   c->prog_data.nr_pull_params = pull_uniform_count;
2572}
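
/* A worked example of the demotion above (assumed numbers, not driver
 * code): with nr_params == 160, uniforms 0..127 stay as push constants
 * and uniforms 128..159 move to pull_param[0..31].  For uniform_nr ==
 * 133, the load offset becomes ((133 - 128) * 4) & ~15 == 16 bytes --
 * the aligned 16-byte block holding that float -- and smear ==
 * (133 - 128) & 3 == 1 picks the second component out of the loaded
 * block.
 */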
2573
2574void
2575fs_visitor::calculate_live_intervals()
2576{
2577   int num_vars = this->virtual_grf_next;
2578   int *def = talloc_array(mem_ctx, int, num_vars);
2579   int *use = talloc_array(mem_ctx, int, num_vars);
2580   int loop_depth = 0;
2581   int loop_start = 0;
2582   int bb_header_ip = 0;
2583
2584   for (int i = 0; i < num_vars; i++) {
2585      def[i] = 1 << 30;
2586      use[i] = -1;
2587   }
2588
2589   int ip = 0;
2590   foreach_iter(exec_list_iterator, iter, this->instructions) {
2591      fs_inst *inst = (fs_inst *)iter.get();
2592
2593      if (inst->opcode == BRW_OPCODE_DO) {
2594	 if (loop_depth++ == 0)
2595	    loop_start = ip;
2596      } else if (inst->opcode == BRW_OPCODE_WHILE) {
2597	 loop_depth--;
2598
2599	 if (loop_depth == 0) {
2600	    /* Patches up the use of vars marked for being live across
2601	     * the whole loop.
2602	     */
2603	    for (int i = 0; i < num_vars; i++) {
2604	       if (use[i] == loop_start) {
2605		  use[i] = ip;
2606	       }
2607	    }
2608	 }
2609      } else {
2610	 for (unsigned int i = 0; i < 3; i++) {
2611	    if (inst->src[i].file == GRF && inst->src[i].reg != 0) {
2612	       int reg = inst->src[i].reg;
2613
2614	       if (!loop_depth || (this->virtual_grf_sizes[reg] == 1 &&
2615				   def[reg] >= bb_header_ip)) {
2616		  use[reg] = ip;
2617	       } else {
2618		  def[reg] = MIN2(loop_start, def[reg]);
2619		  use[reg] = loop_start;
2620
2621		  /* Nobody else is going to go smash our start to
2622		   * later in the loop now, because def[reg] now
2623		   * points before the bb header.
2624		   */
2625	       }
2626	    }
2627	 }
2628	 if (inst->dst.file == GRF && inst->dst.reg != 0) {
2629	    int reg = inst->dst.reg;
2630
2631	    if (!loop_depth || (this->virtual_grf_sizes[reg] == 1 &&
2632				!inst->predicated)) {
2633	       def[reg] = MIN2(def[reg], ip);
2634	    } else {
2635	       def[reg] = MIN2(def[reg], loop_start);
2636	    }
2637	 }
2638      }
2639
2640      ip++;
2641
2642      /* Set the basic block header IP.  This is used for determining
2643       * if a complete def of a single-register virtual GRF in a loop
2644       * dominates a use in the same basic block.  It's a quick way to
2645       * reduce the live interval range of most registers used in a
2646       * loop.
2647       */
2648      if (inst->opcode == BRW_OPCODE_IF ||
2649	  inst->opcode == BRW_OPCODE_ELSE ||
2650	  inst->opcode == BRW_OPCODE_ENDIF ||
2651	  inst->opcode == BRW_OPCODE_DO ||
2652	  inst->opcode == BRW_OPCODE_WHILE ||
2653	  inst->opcode == BRW_OPCODE_BREAK ||
2654	  inst->opcode == BRW_OPCODE_CONTINUE) {
2655	 bb_header_ip = ip;
2656      }
2657   }
2658
2659   talloc_free(this->virtual_grf_def);
2660   talloc_free(this->virtual_grf_use);
2661   this->virtual_grf_def = def;
2662   this->virtual_grf_use = use;
2663}
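
/* An illustration of the loop handling above (assumed IPs, not driver
 * code): take a multi-register vgrf written at ip 12 and read at ip 14,
 * both inside a loop with DO at ip 10 and WHILE at ip 20.  The write
 * clamps def back to loop_start (10), the read parks use at loop_start,
 * and when the WHILE closes the loop that use is patched out to 20 --
 * so the vgrf is treated as live across the entire loop instead of just
 * ips 12..14.
 */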
2664
2665/**
2666 * Attempts to move immediate constants into the immediate
2667 * constant slot of following instructions.
2668 *
2669 * Immediate constants are a bit tricky -- they have to be in the last
2670 * operand slot, and you can't do abs/negate on them.
2671 */
2672
2673bool
2674fs_visitor::propagate_constants()
2675{
2676   bool progress = false;
2677
2678   foreach_iter(exec_list_iterator, iter, this->instructions) {
2679      fs_inst *inst = (fs_inst *)iter.get();
2680
2681      if (inst->opcode != BRW_OPCODE_MOV ||
2682	  inst->predicated ||
2683	  inst->dst.file != GRF || inst->src[0].file != IMM ||
2684	  inst->dst.type != inst->src[0].type)
2685	 continue;
2686
2687      /* Don't bother with cases where we should have had the
2688       * operation on the constant folded in GLSL already.
2689       */
2690      if (inst->saturate)
2691	 continue;
2692
2693      /* Found a move of a constant to a GRF.  Find anything else using the GRF
2694       * before it's written, and replace it with the constant if we can.
2695       */
2696      exec_list_iterator scan_iter = iter;
2697      scan_iter.next();
2698      for (; scan_iter.has_next(); scan_iter.next()) {
2699	 fs_inst *scan_inst = (fs_inst *)scan_iter.get();
2700
2701	 if (scan_inst->opcode == BRW_OPCODE_DO ||
2702	     scan_inst->opcode == BRW_OPCODE_WHILE ||
2703	     scan_inst->opcode == BRW_OPCODE_ELSE ||
2704	     scan_inst->opcode == BRW_OPCODE_ENDIF) {
2705	    break;
2706	 }
2707
2708	 for (int i = 2; i >= 0; i--) {
2709	    if (scan_inst->src[i].file != GRF ||
2710		scan_inst->src[i].reg != inst->dst.reg ||
2711		scan_inst->src[i].reg_offset != inst->dst.reg_offset)
2712	       continue;
2713
2714	    /* Don't bother with cases where we should have had the
2715	     * operation on the constant folded in GLSL already.
2716	     */
2717	    if (scan_inst->src[i].negate || scan_inst->src[i].abs)
2718	       continue;
2719
2720	    switch (scan_inst->opcode) {
2721	    case BRW_OPCODE_MOV:
2722	       scan_inst->src[i] = inst->src[0];
2723	       progress = true;
2724	       break;
2725
2726	    case BRW_OPCODE_MUL:
2727	    case BRW_OPCODE_ADD:
2728	       if (i == 1) {
2729		  scan_inst->src[i] = inst->src[0];
2730		  progress = true;
2731	       } else if (i == 0 && scan_inst->src[1].file != IMM) {
2732		  /* Fit this constant in by commuting the operands */
2733		  scan_inst->src[0] = scan_inst->src[1];
2734		  scan_inst->src[1] = inst->src[0];
		  progress = true;
2735	       }
2736	       break;
2737	    case BRW_OPCODE_CMP:
2738	       if (i == 1) {
2739		  scan_inst->src[i] = inst->src[0];
2740		  progress = true;
2741	       }
2742	    }
2743	 }
2744
2745	 if (scan_inst->dst.file == GRF &&
2746	     scan_inst->dst.reg == inst->dst.reg &&
2747	     (scan_inst->dst.reg_offset == inst->dst.reg_offset ||
2748	      scan_inst->opcode == FS_OPCODE_TEX)) {
2749	    break;
2750	 }
2751      }
2752   }
2753
2754   return progress;
2755}

2756/**
2757 * Must be called after calculate_live_intervals() to remove unused
2758 * writes to registers -- register allocation will fail otherwise
2759 * because a register that is written but never read won't be
2760 * considered to interfere with other regs.
2761 */
2762bool
2763fs_visitor::dead_code_eliminate()
2764{
2765   bool progress = false;
2766   int num_vars = this->virtual_grf_next;
2767   bool dead[num_vars];
2768
2769   for (int i = 0; i < num_vars; i++) {
2770      dead[i] = this->virtual_grf_def[i] >= this->virtual_grf_use[i];
2771
2772      if (dead[i]) {
2773	 /* Mark off its interval so it won't interfere with anything. */
2774	 this->virtual_grf_def[i] = -1;
2775	 this->virtual_grf_use[i] = -1;
2776      }
2777   }
2778
2779   foreach_iter(exec_list_iterator, iter, this->instructions) {
2780      fs_inst *inst = (fs_inst *)iter.get();
2781
2782      if (inst->dst.file == GRF && dead[inst->dst.reg]) {
2783	 inst->remove();
2784	 progress = true;
2785      }
2786   }
2787
2788   return progress;
2789}
2790
2791bool
2792fs_visitor::register_coalesce()
2793{
2794   bool progress = false;
2795
2796   foreach_iter(exec_list_iterator, iter, this->instructions) {
2797      fs_inst *inst = (fs_inst *)iter.get();
2798
2799      if (inst->opcode != BRW_OPCODE_MOV ||
2800	  inst->predicated ||
2801	  inst->saturate ||
2802	  inst->dst.file != GRF || inst->src[0].file != GRF ||
2803	  inst->dst.type != inst->src[0].type)
2804	 continue;
2805
2806      /* Found a move of a GRF to a GRF.  Let's see if we can coalesce
2807       * them: check for no writes to either one until the exit of the
2808       * program.
2809       */
2810      bool interfered = false;
2811      exec_list_iterator scan_iter = iter;
2812      scan_iter.next();
2813      for (; scan_iter.has_next(); scan_iter.next()) {
2814	 fs_inst *scan_inst = (fs_inst *)scan_iter.get();
2815
2816	 if (scan_inst->opcode == BRW_OPCODE_DO ||
2817	     scan_inst->opcode == BRW_OPCODE_WHILE ||
2818	     scan_inst->opcode == BRW_OPCODE_ENDIF) {
2819	    interfered = true;
2820	    iter = scan_iter;
2821	    break;
2822	 }
2823
2824	 if (scan_inst->dst.file == GRF) {
2825	    if (scan_inst->dst.reg == inst->dst.reg &&
2826		(scan_inst->dst.reg_offset == inst->dst.reg_offset ||
2827		 scan_inst->opcode == FS_OPCODE_TEX)) {
2828	       interfered = true;
2829	       break;
2830	    }
2831	    if (scan_inst->dst.reg == inst->src[0].reg &&
2832		(scan_inst->dst.reg_offset == inst->src[0].reg_offset ||
2833		 scan_inst->opcode == FS_OPCODE_TEX)) {
2834	       interfered = true;
2835	       break;
2836	    }
2837	 }
2838      }
2839      if (interfered) {
2840	 continue;
2841      }
2842
2843      /* Update live interval so we don't have to recalculate. */
2844      this->virtual_grf_use[inst->src[0].reg] = MAX2(virtual_grf_use[inst->src[0].reg],
2845						     virtual_grf_use[inst->dst.reg]);
2846
2847      /* Rewrite the later usage to point at the source of the move to
2848       * be removed.
2849       */
2850      for (exec_list_iterator scan_iter = iter; scan_iter.has_next();
2851	   scan_iter.next()) {
2852	 fs_inst *scan_inst = (fs_inst *)scan_iter.get();
2853
2854	 for (int i = 0; i < 3; i++) {
2855	    if (scan_inst->src[i].file == GRF &&
2856		scan_inst->src[i].reg == inst->dst.reg &&
2857		scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
2858	       scan_inst->src[i].reg = inst->src[0].reg;
2859	       scan_inst->src[i].reg_offset = inst->src[0].reg_offset;
2860	       scan_inst->src[i].abs |= inst->src[0].abs;
2861	       scan_inst->src[i].negate ^= inst->src[0].negate;
2862	       scan_inst->src[i].smear = inst->src[0].smear;
2863	    }
2864	 }
2865      }
2866
2867      inst->remove();
2868      progress = true;
2869   }
2870
2871   return progress;
2872}
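
/* On the modifier merging above (our reading, not driver code): when the
 * removed MOV itself carried source modifiers, they compose into the
 * rewritten uses -- a negated use of a MOV of a negated value ends up
 * with negate ^ negate == no negate, while abs flags just accumulate
 * via |=.
 */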
2873
2874
2875bool
2876fs_visitor::compute_to_mrf()
2877{
2878   bool progress = false;
2879   int next_ip = 0;
2880
2881   foreach_iter(exec_list_iterator, iter, this->instructions) {
2882      fs_inst *inst = (fs_inst *)iter.get();
2883
2884      int ip = next_ip;
2885      next_ip++;
2886
2887      if (inst->opcode != BRW_OPCODE_MOV ||
2888	  inst->predicated ||
2889	  inst->dst.file != MRF || inst->src[0].file != GRF ||
2890	  inst->dst.type != inst->src[0].type ||
2891	  inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
2892	 continue;
2893
2894      /* Can't compute-to-MRF this GRF if someone else was going to
2895       * read it later.
2896       */
2897      if (this->virtual_grf_use[inst->src[0].reg] > ip)
2898	 continue;
2899
2900      /* Found a move of a GRF to a MRF.  Let's see if we can go
2901       * rewrite the thing that made this GRF to write into the MRF.
2902       */
2903      bool found = false;
2904      fs_inst *scan_inst;
2905      for (scan_inst = (fs_inst *)inst->prev;
2906	   scan_inst->prev != NULL;
2907	   scan_inst = (fs_inst *)scan_inst->prev) {
2908	 /* We don't handle flow control here.  Most computation of
2909	  * values that end up in MRFs happens shortly before the MRF
2910	  * write anyway.
2911	  */
2912	 if (scan_inst->opcode == BRW_OPCODE_DO ||
2913	     scan_inst->opcode == BRW_OPCODE_WHILE ||
2914	     scan_inst->opcode == BRW_OPCODE_ENDIF) {
2915	    break;
2916	 }
2917
2918	 /* You can't read from an MRF, so if someone else reads our
2919	  * MRF's source GRF that we wanted to rewrite, that stops us.
2920	  */
2921	 bool interfered = false;
2922	 for (int i = 0; i < 3; i++) {
2923	    if (scan_inst->src[i].file == GRF &&
2924		scan_inst->src[i].reg == inst->src[0].reg &&
2925		scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2926	       interfered = true;
2927	    }
2928	 }
2929	 if (interfered)
2930	    break;
2931
2932	 if (scan_inst->dst.file == MRF &&
2933	     scan_inst->dst.hw_reg == inst->dst.hw_reg) {
2934	    /* Somebody else wrote our MRF here, so we can't
2935	     * compute-to-MRF before that.
2936	     */
2937	    break;
2938	 }
2939
2940	 if (scan_inst->mlen > 0) {
2941	    /* Found a SEND instruction, which will do some amount of
2942	     * implied write that may overwrite our MRF that we were
2943	     * hoping to compute-to-MRF somewhere above it.  Nothing we
2944	     * emit, though, implied-writes more than 2 MRFs past
2945	     * base_mrf.
2946	     */
2947	    int implied_write_len = MIN2(scan_inst->mlen, 2);
2948	    if (inst->dst.hw_reg >= scan_inst->base_mrf &&
2949		inst->dst.hw_reg < scan_inst->base_mrf + implied_write_len) {
2950	       break;
2951	    }
2952	 }
2953
2954	 if (scan_inst->dst.file == GRF &&
2955	     scan_inst->dst.reg == inst->src[0].reg) {
2956	    /* Found the last thing to write our reg we want to turn
2957	     * into a compute-to-MRF.
2958	     */
2959
2960	    if (scan_inst->opcode == FS_OPCODE_TEX) {
2961	       /* texturing writes several contiguous regs, so we can't
2962		* compute-to-mrf that.
2963		*/
2964	       break;
2965	    }
2966
2967	    /* If it's predicated, it (probably) didn't populate all
2968	     * the channels.
2969	     */
2970	    if (scan_inst->predicated)
2971	       break;
2972
2973	    /* SEND instructions can't have MRF as a destination. */
2974	    if (scan_inst->mlen)
2975	       break;
2976
2977	    if (intel->gen >= 6) {
2978	       /* gen6 math instructions must have the destination be
2979		* GRF, so no compute-to-MRF for them.
2980		*/
2981	       if (scan_inst->opcode == FS_OPCODE_RCP ||
2982		   scan_inst->opcode == FS_OPCODE_RSQ ||
2983		   scan_inst->opcode == FS_OPCODE_SQRT ||
2984		   scan_inst->opcode == FS_OPCODE_EXP2 ||
2985		   scan_inst->opcode == FS_OPCODE_LOG2 ||
2986		   scan_inst->opcode == FS_OPCODE_SIN ||
2987		   scan_inst->opcode == FS_OPCODE_COS ||
2988		   scan_inst->opcode == FS_OPCODE_POW) {
2989		  break;
2990	       }
2991	    }
2992
2993	    if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2994	       /* Found the creator of our MRF's source value. */
2995	       found = true;
2996	       break;
2997	    }
2998	 }
2999      }
3000      if (found) {
3001	 scan_inst->dst.file = MRF;
3002	 scan_inst->dst.hw_reg = inst->dst.hw_reg;
3003	 scan_inst->saturate |= inst->saturate;
3004	 inst->remove();
3005	 progress = true;
3006      }
3007   }
3008
3009   return progress;
3010}
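
/* The transformation above in miniature (assumed instructions, not
 * driver code):
 *
 *    ADD vgrf8, vgrf6, vgrf7         ADD m2, vgrf6, vgrf7
 *    ...                        =>   ...
 *    MOV m2, vgrf8                   (MOV removed)
 *
 * which is only done when nothing in between touches m2, nothing reads
 * vgrf8 afterwards, and no intervening SEND implied-writes over that
 * MRF.
 */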
3011
3012bool
3013fs_visitor::virtual_grf_interferes(int a, int b)
3014{
3015   int start = MAX2(this->virtual_grf_def[a], this->virtual_grf_def[b]);
3016   int end = MIN2(this->virtual_grf_use[a], this->virtual_grf_use[b]);
3017
3018   /* For dead code, just check if the def interferes with the other range. */
3019   if (this->virtual_grf_use[a] == -1) {
3020      return (this->virtual_grf_def[a] >= this->virtual_grf_def[b] &&
3021	      this->virtual_grf_def[a] < this->virtual_grf_use[b]);
3022   }
3023   if (this->virtual_grf_use[b] == -1) {
3024      return (this->virtual_grf_def[b] >= this->virtual_grf_def[a] &&
3025	      this->virtual_grf_def[b] < this->virtual_grf_use[a]);
3026   }
3027
3028   return start < end;
3029}
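
/* The interference check by example (assumed intervals, not driver
 * code): vgrf a with [def 4, use 9] and vgrf b with [def 9, use 15] do
 * not interfere, since start = MAX2(4, 9) == 9 is not less than end =
 * MIN2(9, 15) == 9.  A def landing exactly on another interval's last
 * use is allowed, which is what lets register allocation reuse the reg
 * at the boundary.
 */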
3030
3031static struct brw_reg brw_reg_from_fs_reg(fs_reg *reg)
3032{
3033   struct brw_reg brw_reg;
3034
3035   switch (reg->file) {
3036   case GRF:
3037   case ARF:
3038   case MRF:
3039      if (reg->smear == -1) {
3040	 brw_reg = brw_vec8_reg(reg->file,
3041				reg->hw_reg, 0);
3042      } else {
3043	 brw_reg = brw_vec1_reg(reg->file,
3044				reg->hw_reg, reg->smear);
3045      }
3046      brw_reg = retype(brw_reg, reg->type);
3047      break;
3048   case IMM:
3049      switch (reg->type) {
3050      case BRW_REGISTER_TYPE_F:
3051	 brw_reg = brw_imm_f(reg->imm.f);
3052	 break;
3053      case BRW_REGISTER_TYPE_D:
3054	 brw_reg = brw_imm_d(reg->imm.i);
3055	 break;
3056      case BRW_REGISTER_TYPE_UD:
3057	 brw_reg = brw_imm_ud(reg->imm.u);
3058	 break;
3059      default:
3060	 assert(!"not reached");
3061	 break;
3062      }
3063      break;
3064   case FIXED_HW_REG:
3065      brw_reg = reg->fixed_hw_reg;
3066      break;
3067   case BAD_FILE:
3068      /* Probably unused. */
3069      brw_reg = brw_null_reg();
3070      break;
3071   case UNIFORM:
3072      assert(!"not reached");
3073      brw_reg = brw_null_reg();
3074      break;
3075   }
3076   if (reg->abs)
3077      brw_reg = brw_abs(brw_reg);
3078   if (reg->negate)
3079      brw_reg = negate(brw_reg);
3080
3081   return brw_reg;
3082}
3083
3084void
3085fs_visitor::generate_code()
3086{
3087   int last_native_inst = 0;
3088   struct brw_instruction *if_stack[16], *loop_stack[16];
3089   int if_stack_depth = 0, loop_stack_depth = 0;
3090   int if_depth_in_loop[16];
3091   const char *last_annotation_string = NULL;
3092   ir_instruction *last_annotation_ir = NULL;
3093
3094   if (INTEL_DEBUG & DEBUG_WM) {
3095      printf("Native code for fragment shader %d:\n",
3096	     ctx->Shader.CurrentProgram->Name);
3097   }
3098
3099   if_depth_in_loop[loop_stack_depth] = 0;
3100
3101   memset(&if_stack, 0, sizeof(if_stack));
3102   foreach_iter(exec_list_iterator, iter, this->instructions) {
3103      fs_inst *inst = (fs_inst *)iter.get();
3104      struct brw_reg src[3], dst;
3105
3106      if (INTEL_DEBUG & DEBUG_WM) {
3107	 if (last_annotation_ir != inst->ir) {
3108	    last_annotation_ir = inst->ir;
3109	    if (last_annotation_ir) {
3110	       printf("   ");
3111	       last_annotation_ir->print();
3112	       printf("\n");
3113	    }
3114	 }
3115	 if (last_annotation_string != inst->annotation) {
3116	    last_annotation_string = inst->annotation;
3117	    if (last_annotation_string)
3118	       printf("   %s\n", last_annotation_string);
3119	 }
3120      }
3121
3122      for (unsigned int i = 0; i < 3; i++) {
3123	 src[i] = brw_reg_from_fs_reg(&inst->src[i]);
3124      }
3125      dst = brw_reg_from_fs_reg(&inst->dst);
3126
3127      brw_set_conditionalmod(p, inst->conditional_mod);
3128      brw_set_predicate_control(p, inst->predicated);
3129
3130      switch (inst->opcode) {
3131      case BRW_OPCODE_MOV:
3132	 brw_MOV(p, dst, src[0]);
3133	 break;
3134      case BRW_OPCODE_ADD:
3135	 brw_ADD(p, dst, src[0], src[1]);
3136	 break;
3137      case BRW_OPCODE_MUL:
3138	 brw_MUL(p, dst, src[0], src[1]);
3139	 break;
3140
3141      case BRW_OPCODE_FRC:
3142	 brw_FRC(p, dst, src[0]);
3143	 break;
3144      case BRW_OPCODE_RNDD:
3145	 brw_RNDD(p, dst, src[0]);
3146	 break;
3147      case BRW_OPCODE_RNDE:
3148	 brw_RNDE(p, dst, src[0]);
3149	 break;
3150      case BRW_OPCODE_RNDZ:
3151	 brw_RNDZ(p, dst, src[0]);
3152	 break;
3153
3154      case BRW_OPCODE_AND:
3155	 brw_AND(p, dst, src[0], src[1]);
3156	 break;
3157      case BRW_OPCODE_OR:
3158	 brw_OR(p, dst, src[0], src[1]);
3159	 break;
3160      case BRW_OPCODE_XOR:
3161	 brw_XOR(p, dst, src[0], src[1]);
3162	 break;
3163      case BRW_OPCODE_NOT:
3164	 brw_NOT(p, dst, src[0]);
3165	 break;
3166      case BRW_OPCODE_ASR:
3167	 brw_ASR(p, dst, src[0], src[1]);
3168	 break;
3169      case BRW_OPCODE_SHR:
3170	 brw_SHR(p, dst, src[0], src[1]);
3171	 break;
3172      case BRW_OPCODE_SHL:
3173	 brw_SHL(p, dst, src[0], src[1]);
3174	 break;
3175
3176      case BRW_OPCODE_CMP:
3177	 brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
3178	 break;
3179      case BRW_OPCODE_SEL:
3180	 brw_SEL(p, dst, src[0], src[1]);
3181	 break;
3182
3183      case BRW_OPCODE_IF:
3184	 assert(if_stack_depth < 16);
3185	 if (inst->src[0].file != BAD_FILE) {
3186	    assert(intel->gen >= 6);
3187	    if_stack[if_stack_depth] = brw_IF_gen6(p, inst->conditional_mod, src[0], src[1]);
3188	 } else {
3189	    if_stack[if_stack_depth] = brw_IF(p, BRW_EXECUTE_8);
3190	 }
3191	 if_depth_in_loop[loop_stack_depth]++;
3192	 if_stack_depth++;
3193	 break;
3194
3195      case BRW_OPCODE_ELSE:
3196	 if_stack[if_stack_depth - 1] =
3197	    brw_ELSE(p, if_stack[if_stack_depth - 1]);
3198	 break;
3199      case BRW_OPCODE_ENDIF:
3200	 if_stack_depth--;
3201	 brw_ENDIF(p, if_stack[if_stack_depth]);
3202	 if_depth_in_loop[loop_stack_depth]--;
3203	 break;
3204
3205      case BRW_OPCODE_DO:
3206	 loop_stack[loop_stack_depth++] = brw_DO(p, BRW_EXECUTE_8);
3207	 if_depth_in_loop[loop_stack_depth] = 0;
3208	 break;
3209
3210      case BRW_OPCODE_BREAK:
3211	 brw_BREAK(p, if_depth_in_loop[loop_stack_depth]);
3212	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
3213	 break;
3214      case BRW_OPCODE_CONTINUE:
3215	 brw_CONT(p, if_depth_in_loop[loop_stack_depth]);
3216	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
3217	 break;
3218
3219      case BRW_OPCODE_WHILE: {
3220	 struct brw_instruction *inst0, *inst1;
3221	 GLuint br = 1;
3222
3223	 if (intel->gen >= 5)
3224	    br = 2;
3225
3226	 assert(loop_stack_depth > 0);
3227	 loop_stack_depth--;
3228	 inst0 = inst1 = brw_WHILE(p, loop_stack[loop_stack_depth]);
3229	 /* patch all the BREAK/CONT instructions from last BGNLOOP */
3230	 while (inst0 > loop_stack[loop_stack_depth]) {
3231	    inst0--;
3232	    if (inst0->header.opcode == BRW_OPCODE_BREAK &&
3233		inst0->bits3.if_else.jump_count == 0) {
3234	       inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
3235	    }
3236	    else if (inst0->header.opcode == BRW_OPCODE_CONTINUE &&
3237		     inst0->bits3.if_else.jump_count == 0) {
3238	       inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
3239	    }
3240	 }
3241      }
3242	 break;
3243
3244      case FS_OPCODE_RCP:
3245      case FS_OPCODE_RSQ:
3246      case FS_OPCODE_SQRT:
3247      case FS_OPCODE_EXP2:
3248      case FS_OPCODE_LOG2:
3249      case FS_OPCODE_POW:
3250      case FS_OPCODE_SIN:
3251      case FS_OPCODE_COS:
3252	 generate_math(inst, dst, src);
3253	 break;
3254      case FS_OPCODE_LINTERP:
3255	 generate_linterp(inst, dst, src);
3256	 break;
3257      case FS_OPCODE_TEX:
3258      case FS_OPCODE_TXB:
3259      case FS_OPCODE_TXL:
3260	 generate_tex(inst, dst);
3261	 break;
3262      case FS_OPCODE_DISCARD_NOT:
3263	 generate_discard_not(inst, dst);
3264	 break;
3265      case FS_OPCODE_DISCARD_AND:
3266	 generate_discard_and(inst, src[0]);
3267	 break;
3268      case FS_OPCODE_DDX:
3269	 generate_ddx(inst, dst, src[0]);
3270	 break;
3271      case FS_OPCODE_DDY:
3272	 generate_ddy(inst, dst, src[0]);
3273	 break;
3274
3275      case FS_OPCODE_SPILL:
3276	 generate_spill(inst, src[0]);
3277	 break;
3278
3279      case FS_OPCODE_UNSPILL:
3280	 generate_unspill(inst, dst);
3281	 break;
3282
3283      case FS_OPCODE_PULL_CONSTANT_LOAD:
3284	 generate_pull_constant_load(inst, dst);
3285	 break;
3286
3287      case FS_OPCODE_FB_WRITE:
3288	 generate_fb_write(inst);
3289	 break;
3290      default:
3291	 if (inst->opcode < (int)ARRAY_SIZE(brw_opcodes)) {
3292	    _mesa_problem(ctx, "Unsupported opcode `%s' in FS",
3293			  brw_opcodes[inst->opcode].name);
3294	 } else {
3295	    _mesa_problem(ctx, "Unsupported opcode %d in FS", inst->opcode);
3296	 }
3297	 this->fail = true;
3298      }
3299
3300      if (INTEL_DEBUG & DEBUG_WM) {
3301	 for (unsigned int i = last_native_inst; i < p->nr_insn; i++) {
3302	    if (0) {
3303	       printf("0x%08x 0x%08x 0x%08x 0x%08x ",
3304		      ((uint32_t *)&p->store[i])[3],
3305		      ((uint32_t *)&p->store[i])[2],
3306		      ((uint32_t *)&p->store[i])[1],
3307		      ((uint32_t *)&p->store[i])[0]);
3308	    }
3309	    brw_disasm(stdout, &p->store[i], intel->gen);
3310	    printf("\n");
3311	 }
3312      }
3313
3314      last_native_inst = p->nr_insn;
3315   }
3316}
3317
3318GLboolean
3319brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c)
3320{
3321   struct intel_context *intel = &brw->intel;
3322   struct gl_context *ctx = &intel->ctx;
3323   struct gl_shader_program *prog = ctx->Shader.CurrentProgram;
3324
3325   if (!prog)
3326      return GL_FALSE;
3327
3328   struct brw_shader *shader =
3329     (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3330   if (!shader)
3331      return GL_FALSE;
3332
3333   /* We always use 8-wide mode, at least for now.  For one, flow
3334    * control only works in 8-wide.  Also, when we're fragment-shader
3335    * bound, we're almost always under register pressure as well, so
3336    * 8-wide would save us from the performance cliff of spilling
3337    * regs.
3338    */
3339   c->dispatch_width = 8;
3340
3341   if (INTEL_DEBUG & DEBUG_WM) {
3342      printf("GLSL IR for native fragment shader %d:\n", prog->Name);
3343      _mesa_print_ir(shader->ir, NULL);
3344      printf("\n");
3345   }
3346
3347   /* Now the main event: Visit the shader IR and generate our FS IR for it.
3348    */
3349   fs_visitor v(c, shader);
3350
3351   if (0) {
3352      v.emit_dummy_fs();
3353   } else {
3354      v.calculate_urb_setup();
3355      if (intel->gen < 6)
3356	 v.emit_interpolation_setup_gen4();
3357      else
3358	 v.emit_interpolation_setup_gen6();
3359
3360      /* Generate FS IR for main().  (The visitor only descends into
3361       * functions called "main".)
3362       */
3363      foreach_iter(exec_list_iterator, iter, *shader->ir) {
3364	 ir_instruction *ir = (ir_instruction *)iter.get();
3365	 v.base_ir = ir;
3366	 ir->accept(&v);
3367      }
3368
3369      v.emit_fb_writes();
3370
3371      v.split_virtual_grfs();
3372      v.setup_pull_constants();
3373
3374      v.assign_curb_setup();
3375      v.assign_urb_setup();
3376
3377      bool progress;
3378      do {
3379	 progress = false;
3380	 v.calculate_live_intervals();
3381	 progress = v.propagate_constants() || progress;
3382	 progress = v.register_coalesce() || progress;
3383	 progress = v.compute_to_mrf() || progress;
3384	 progress = v.dead_code_eliminate() || progress;
3385      } while (progress);
3386
3387      if (0) {
3388	 /* Debug of register spilling: Go spill everything. */
3389	 int virtual_grf_count = v.virtual_grf_next;
3390	 for (int i = 1; i < virtual_grf_count; i++) {
3391	    v.spill_reg(i);
3392	 }
3393	 v.calculate_live_intervals();
3394      }
3395
3396      if (0)
3397	 v.assign_regs_trivial();
3398      else {
3399	 while (!v.assign_regs()) {
3400	    if (v.fail)
3401	       break;
3402
3403	    v.calculate_live_intervals();
3404	 }
3405      }
3406   }
3407
3408   if (!v.fail)
3409      v.generate_code();
3410
3411   assert(!v.fail); /* FINISHME: Cleanly fail, tested at link time, etc. */
3412
3413   if (v.fail)
3414      return GL_FALSE;
3415
3416   c->prog_data.total_grf = v.grf_used;
3417
3418   return GL_TRUE;
3419}
3420