brw_fs.cpp revision df4d83dca4618eb7077637865763d3e9ab750d11
/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Eric Anholt <eric@anholt.net>
 *
 */

extern "C" {

#include <sys/types.h>

#include "main/macros.h"
#include "main/shaderobj.h"
#include "main/uniforms.h"
#include "program/prog_parameter.h"
#include "program/prog_print.h"
#include "program/prog_optimize.h"
#include "program/register_allocate.h"
#include "program/sampler.h"
#include "program/hash_table.h"
#include "brw_context.h"
#include "brw_eu.h"
#include "brw_wm.h"
#include "talloc.h"
}
#include "brw_fs.h"
#include "../glsl/glsl_types.h"
#include "../glsl/ir_optimization.h"
#include "../glsl/ir_print_visitor.h"

static struct brw_reg brw_reg_from_fs_reg(class fs_reg *reg);

struct gl_shader *
brw_new_shader(struct gl_context *ctx, GLuint name, GLuint type)
{
   struct brw_shader *shader;

   shader = talloc_zero(NULL, struct brw_shader);
   if (shader) {
      shader->base.Type = type;
      shader->base.Name = name;
      _mesa_init_shader(ctx, &shader->base);
   }

   return &shader->base;
}

struct gl_shader_program *
brw_new_shader_program(struct gl_context *ctx, GLuint name)
{
   struct brw_shader_program *prog;
   prog = talloc_zero(NULL, struct brw_shader_program);
   if (prog) {
      prog->base.Name = name;
      _mesa_init_shader_program(ctx, &prog->base);
   }
   return &prog->base;
}

GLboolean
brw_compile_shader(struct gl_context *ctx, struct gl_shader *shader)
{
   if (!_mesa_ir_compile_shader(ctx, shader))
      return GL_FALSE;

   return GL_TRUE;
}

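/* Called at link time.  Clones the linked fragment shader's IR and runs
 * the FS-specific lowering passes (matrix ops to vectors, instruction
 * lowering, texture projection, cubemap normalization, and so on) to a
 * fixed point before handing the program to the common linker.
 */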
GLboolean
brw_link_shader(struct gl_context *ctx, struct gl_shader_program *prog)
{
   struct brw_context *brw = brw_context(ctx);
   struct intel_context *intel = &brw->intel;

   struct brw_shader *shader =
      (struct brw_shader *)prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
   if (shader != NULL) {
      void *mem_ctx = talloc_new(NULL);
      bool progress;

      if (shader->ir)
	 talloc_free(shader->ir);
      shader->ir = new(shader) exec_list;
      clone_ir_list(mem_ctx, shader->ir, shader->base.ir);

      do_mat_op_to_vec(shader->ir);
      lower_instructions(shader->ir,
			 MOD_TO_FRACT |
			 DIV_TO_MUL_RCP |
			 SUB_TO_ADD_NEG |
			 EXP_TO_EXP2 |
			 LOG_TO_LOG2);

      /* Pre-gen6 HW can only nest if-statements 16 deep.  Beyond this,
       * if-statements need to be flattened.
       */
      if (intel->gen < 6)
	 lower_if_to_cond_assign(shader->ir, 16);

      do_lower_texture_projection(shader->ir);
      do_vec_index_to_cond_assign(shader->ir);
      brw_do_cubemap_normalize(shader->ir);

      do {
	 progress = false;

	 brw_do_channel_expressions(shader->ir);
	 brw_do_vector_splitting(shader->ir);

	 progress = do_lower_jumps(shader->ir, true, true,
				   true, /* main return */
				   false, /* continue */
				   false /* loops */
				   ) || progress;

	 progress = do_common_optimization(shader->ir, true, 32) || progress;

	 progress = lower_noise(shader->ir) || progress;
	 progress =
	    lower_variable_index_to_cond_assign(shader->ir,
						GL_TRUE, /* input */
						GL_TRUE, /* output */
						GL_TRUE, /* temp */
						GL_TRUE /* uniform */
						) || progress;
	 progress = lower_quadop_vector(shader->ir, false) || progress;
      } while (progress);

      validate_ir_tree(shader->ir);

      reparent_ir(shader->ir, shader->ir);
      talloc_free(mem_ctx);
   }

   if (!_mesa_ir_link_shader(ctx, prog))
      return GL_FALSE;

   return GL_TRUE;
}

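/* Returns the storage size of a GLSL type in scalar components, which is
 * the unit used for virtual GRF allocation and reg_offset arithmetic.
 */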
static int
type_size(const struct glsl_type *type)
{
   unsigned int size, i;

   switch (type->base_type) {
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_BOOL:
      return type->components();
   case GLSL_TYPE_ARRAY:
      return type_size(type->fields.array) * type->length;
   case GLSL_TYPE_STRUCT:
      size = 0;
      for (i = 0; i < type->length; i++) {
	 size += type_size(type->fields.structure[i].type);
      }
      return size;
   case GLSL_TYPE_SAMPLER:
      /* Samplers take up no register space, since they're baked in at
       * link time.
       */
      return 0;
   default:
      assert(!"not reached");
      return 0;
   }
}

/**
 * Returns how many MRFs an FS opcode will write over.
 *
 * Note that this is not the 0 or 1 implied writes in an actual gen
 * instruction -- the FS opcodes often generate MOVs in addition.
 */
int
fs_visitor::implied_mrf_writes(fs_inst *inst)
{
   if (inst->mlen == 0)
      return 0;

   switch (inst->opcode) {
   case FS_OPCODE_RCP:
   case FS_OPCODE_RSQ:
   case FS_OPCODE_SQRT:
   case FS_OPCODE_EXP2:
   case FS_OPCODE_LOG2:
   case FS_OPCODE_SIN:
   case FS_OPCODE_COS:
      return 1;
   case FS_OPCODE_POW:
      return 2;
   case FS_OPCODE_TEX:
   case FS_OPCODE_TXB:
   case FS_OPCODE_TXL:
      return 1;
   case FS_OPCODE_FB_WRITE:
      return 2;
   case FS_OPCODE_PULL_CONSTANT_LOAD:
   case FS_OPCODE_UNSPILL:
      return 1;
   case FS_OPCODE_SPILL:
      return 2;
   default:
      assert(!"not reached");
      return inst->mlen;
   }
}

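/* Allocates a new virtual GRF of the given size (in scalar components)
 * and returns its register number, growing the size-tracking array as
 * needed.  Register number 0 is kept unused.
 */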
int
fs_visitor::virtual_grf_alloc(int size)
{
   if (virtual_grf_array_size <= virtual_grf_next) {
      if (virtual_grf_array_size == 0)
	 virtual_grf_array_size = 16;
      else
	 virtual_grf_array_size *= 2;
      virtual_grf_sizes = talloc_realloc(mem_ctx, virtual_grf_sizes,
					 int, virtual_grf_array_size);

      /* This slot is always unused. */
      virtual_grf_sizes[0] = 0;
   }
   virtual_grf_sizes[virtual_grf_next] = size;
   return virtual_grf_next++;
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int hw_reg)
{
   init();
   this->file = file;
   this->hw_reg = hw_reg;
   this->type = BRW_REGISTER_TYPE_F;
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int hw_reg, uint32_t type)
{
   init();
   this->file = file;
   this->hw_reg = hw_reg;
   this->type = type;
}

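/* Maps a GLSL base type to the BRW register type used to operate on it. */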
int
brw_type_for_base_type(const struct glsl_type *type)
{
   switch (type->base_type) {
   case GLSL_TYPE_FLOAT:
      return BRW_REGISTER_TYPE_F;
   case GLSL_TYPE_INT:
   case GLSL_TYPE_BOOL:
      return BRW_REGISTER_TYPE_D;
   case GLSL_TYPE_UINT:
      return BRW_REGISTER_TYPE_UD;
   case GLSL_TYPE_ARRAY:
   case GLSL_TYPE_STRUCT:
   case GLSL_TYPE_SAMPLER:
      /* These should be overridden with the type of the member when
       * dereferenced into.  BRW_REGISTER_TYPE_UD seems like a likely
       * way to trip up if we don't.
       */
      return BRW_REGISTER_TYPE_UD;
   default:
      assert(!"not reached");
      return BRW_REGISTER_TYPE_F;
   }
}

/** Automatic reg constructor. */
fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
{
   init();

   this->file = GRF;
   this->reg = v->virtual_grf_alloc(type_size(type));
   this->reg_offset = 0;
   this->type = brw_type_for_base_type(type);
}

fs_reg *
fs_visitor::variable_storage(ir_variable *var)
{
   return (fs_reg *)hash_table_find(this->variable_ht, var);
}

/* Our support for uniforms is piggy-backed on the struct
 * gl_fragment_program, because that's where the values actually
 * get stored, rather than in some global gl_shader_program uniform
 * store.
 */
int
fs_visitor::setup_uniform_values(int loc, const glsl_type *type)
{
   unsigned int offset = 0;
   float *vec_values;

   if (type->is_matrix()) {
      const glsl_type *column = glsl_type::get_instance(GLSL_TYPE_FLOAT,
							type->vector_elements,
							1);

      for (unsigned int i = 0; i < type->matrix_columns; i++) {
	 offset += setup_uniform_values(loc + offset, column);
      }

      return offset;
   }

   switch (type->base_type) {
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_BOOL:
      vec_values = fp->Base.Parameters->ParameterValues[loc];
      for (unsigned int i = 0; i < type->vector_elements; i++) {
	 unsigned int param = c->prog_data.nr_params++;

	 assert(param < ARRAY_SIZE(c->prog_data.param));

	 switch (type->base_type) {
	 case GLSL_TYPE_FLOAT:
	    c->prog_data.param_convert[param] = PARAM_NO_CONVERT;
	    break;
	 case GLSL_TYPE_UINT:
	    c->prog_data.param_convert[param] = PARAM_CONVERT_F2U;
	    break;
	 case GLSL_TYPE_INT:
	    c->prog_data.param_convert[param] = PARAM_CONVERT_F2I;
	    break;
	 case GLSL_TYPE_BOOL:
	    c->prog_data.param_convert[param] = PARAM_CONVERT_F2B;
	    break;
	 default:
	    assert(!"not reached");
	    c->prog_data.param_convert[param] = PARAM_NO_CONVERT;
	    break;
	 }

	 c->prog_data.param[param] = &vec_values[i];
      }
      return 1;

   case GLSL_TYPE_STRUCT:
      for (unsigned int i = 0; i < type->length; i++) {
	 offset += setup_uniform_values(loc + offset,
					type->fields.structure[i].type);
      }
      return offset;

   case GLSL_TYPE_ARRAY:
      for (unsigned int i = 0; i < type->length; i++) {
	 offset += setup_uniform_values(loc + offset, type->fields.array);
      }
      return offset;

   case GLSL_TYPE_SAMPLER:
      /* The sampler takes up a slot, but we don't use any values from it. */
      return 1;

   default:
      assert(!"not reached");
      return 0;
   }
}


/* Our support for builtin uniforms is even scarier than non-builtin.
 * It sits on top of the PROG_STATE_VAR parameters that are
 * automatically updated from GL context state.
 */
void
fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
{
   const struct gl_builtin_uniform_desc *statevar = NULL;

   /* Walk the table, stopping on the sentinel entry so that a failed
    * lookup leaves statevar->name == NULL for the check below.
    */
   for (unsigned int i = 0; ; i++) {
      statevar = &_mesa_builtin_uniform_desc[i];
      if (!statevar->name ||
	  strcmp(ir->name, statevar->name) == 0)
	 break;
   }

   if (!statevar->name) {
      this->fail = true;
      printf("Failed to find builtin uniform `%s'\n", ir->name);
      return;
   }

   int array_count;
   if (ir->type->is_array()) {
      array_count = ir->type->length;
   } else {
      array_count = 1;
   }

   for (int a = 0; a < array_count; a++) {
      for (unsigned int i = 0; i < statevar->num_elements; i++) {
	 struct gl_builtin_uniform_element *element = &statevar->elements[i];
	 int tokens[STATE_LENGTH];

	 memcpy(tokens, element->tokens, sizeof(element->tokens));
	 if (ir->type->is_array()) {
	    tokens[1] = a;
	 }

	 /* This state reference has already been set up by ir_to_mesa,
	  * but we'll get the same index back here.
	  */
	 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
					       (gl_state_index *)tokens);
	 float *vec_values = this->fp->Base.Parameters->ParameterValues[index];

	 /* Add each of the unique swizzles of the element as a
	  * parameter.  This'll end up matching the expected layout of
	  * the array/matrix/structure we're trying to fill in.
	  */
	 int last_swiz = -1;
	 for (unsigned int j = 0; j < 4; j++) {
	    int swiz = GET_SWZ(element->swizzle, j);
	    if (swiz == last_swiz)
	       break;
	    last_swiz = swiz;

	    c->prog_data.param_convert[c->prog_data.nr_params] =
	       PARAM_NO_CONVERT;
	    c->prog_data.param[c->prog_data.nr_params++] = &vec_values[swiz];
	 }
      }
   }
}

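/* Emits the setup for gl_FragCoord, handling the pixel-center and
 * origin conventions and flipping Y when rendering to an FBO.
 */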
fs_reg *
fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   fs_reg wpos = *reg;
   fs_reg neg_y = this->pixel_y;
   neg_y.negate = true;
   bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;

   /* gl_FragCoord.x */
   if (ir->pixel_center_integer) {
      emit(fs_inst(BRW_OPCODE_MOV, wpos, this->pixel_x));
   } else {
      emit(fs_inst(BRW_OPCODE_ADD, wpos, this->pixel_x, fs_reg(0.5f)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.y */
   if (!flip && ir->pixel_center_integer) {
      emit(fs_inst(BRW_OPCODE_MOV, wpos, this->pixel_y));
   } else {
      fs_reg pixel_y = this->pixel_y;
      float offset = (ir->pixel_center_integer ? 0.0 : 0.5);

      if (flip) {
	 pixel_y.negate = true;
	 offset += c->key.drawable_height - 1.0;
      }

      emit(fs_inst(BRW_OPCODE_ADD, wpos, pixel_y, fs_reg(offset)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.z */
   if (intel->gen >= 6) {
      emit(fs_inst(BRW_OPCODE_MOV, wpos,
		   fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
   } else {
      emit(fs_inst(FS_OPCODE_LINTERP, wpos, this->delta_x, this->delta_y,
		   interp_reg(FRAG_ATTRIB_WPOS, 2)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.w: Already set up in emit_interpolation_setup_gen*(). */
   emit(fs_inst(BRW_OPCODE_MOV, wpos, this->wpos_w));

   return reg;
}

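/* Emits per-channel linear interpolation for a general varying input,
 * walking array elements and matrix columns and skipping slots with no
 * incoming setup data.  On pre-gen6 each interpolated channel is then
 * multiplied by this->pixel_w to complete the perspective correction.
 */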
fs_reg *
fs_visitor::emit_general_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   /* Interpolation is always in floating point regs. */
   reg->type = BRW_REGISTER_TYPE_F;
   fs_reg attr = *reg;

   unsigned int array_elements;
   const glsl_type *type;

   if (ir->type->is_array()) {
      array_elements = ir->type->length;
      if (array_elements == 0) {
	 this->fail = true;
      }
      type = ir->type->fields.array;
   } else {
      array_elements = 1;
      type = ir->type;
   }

   int location = ir->location;
   for (unsigned int i = 0; i < array_elements; i++) {
      for (unsigned int j = 0; j < type->matrix_columns; j++) {
	 if (urb_setup[location] == -1) {
	    /* If there's no incoming setup data for this slot, don't
	     * emit interpolation for it.
	     */
	    attr.reg_offset += type->vector_elements;
	    location++;
	    continue;
	 }

	 for (unsigned int c = 0; c < type->vector_elements; c++) {
	    struct brw_reg interp = interp_reg(location, c);
	    emit(fs_inst(FS_OPCODE_LINTERP,
			 attr,
			 this->delta_x,
			 this->delta_y,
			 fs_reg(interp)));
	    attr.reg_offset++;
	 }

	 if (intel->gen < 6) {
	    attr.reg_offset -= type->vector_elements;
	    for (unsigned int c = 0; c < type->vector_elements; c++) {
	       emit(fs_inst(BRW_OPCODE_MUL,
			    attr,
			    attr,
			    this->pixel_w));
	       attr.reg_offset++;
	    }
	 }
	 location++;
      }
   }

   return reg;
}

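/* Emits the computation of gl_FrontFacing from the front/back-facing
 * bit that arrives in the thread payload.
 */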
fs_reg *
fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);

   /* The frontfacing comes in as a bit in the thread payload. */
   if (intel->gen >= 6) {
      emit(fs_inst(BRW_OPCODE_ASR,
		   *reg,
		   fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
		   fs_reg(15)));
      emit(fs_inst(BRW_OPCODE_NOT,
		   *reg,
		   *reg));
      emit(fs_inst(BRW_OPCODE_AND,
		   *reg,
		   *reg,
		   fs_reg(1)));
   } else {
      struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
      /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
       * us front face
       */
      fs_inst *inst = emit(fs_inst(BRW_OPCODE_CMP,
				   *reg,
				   fs_reg(r1_6ud),
				   fs_reg(1u << 31)));
      inst->conditional_mod = BRW_CONDITIONAL_L;
      emit(fs_inst(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u)));
   }

   return reg;
}

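/* Emits a single-operand math instruction (RCP, RSQ, SQRT, EXP2, LOG2,
 * SIN, COS), working around gen6's restrictions on math sources by
 * copying uniform or source-modified operands through a temporary.  On
 * pre-gen6, math is a message to the shared math unit, so the MRF
 * location and message length are recorded here.
 */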
fs_inst *
fs_visitor::emit_math(fs_opcodes opcode, fs_reg dst, fs_reg src)
{
   switch (opcode) {
   case FS_OPCODE_RCP:
   case FS_OPCODE_RSQ:
   case FS_OPCODE_SQRT:
   case FS_OPCODE_EXP2:
   case FS_OPCODE_LOG2:
   case FS_OPCODE_SIN:
   case FS_OPCODE_COS:
      break;
   default:
      assert(!"not reached: bad math opcode");
      return NULL;
   }

   /* Can't do hstride == 0 args to gen6 math, so expand it out.  We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * The hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (intel->gen >= 6 && (src.file == UNIFORM ||
			   src.abs ||
			   src.negate)) {
      fs_reg expanded = fs_reg(this, glsl_type::float_type);
      emit(fs_inst(BRW_OPCODE_MOV, expanded, src));
      src = expanded;
   }

   fs_inst *inst = emit(fs_inst(opcode, dst, src));

   if (intel->gen < 6) {
      inst->base_mrf = 2;
      inst->mlen = 1;
   }

   return inst;
}

fs_inst *
fs_visitor::emit_math(fs_opcodes opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   int base_mrf = 2;
   fs_inst *inst;

   assert(opcode == FS_OPCODE_POW);

   if (intel->gen >= 6) {
      /* Can't do hstride == 0 args to gen6 math, so expand it out. */
      if (src0.file == UNIFORM) {
	 fs_reg expanded = fs_reg(this, glsl_type::float_type);
	 emit(fs_inst(BRW_OPCODE_MOV, expanded, src0));
	 src0 = expanded;
      }

      if (src1.file == UNIFORM) {
	 fs_reg expanded = fs_reg(this, glsl_type::float_type);
	 emit(fs_inst(BRW_OPCODE_MOV, expanded, src1));
	 src1 = expanded;
      }

      inst = emit(fs_inst(opcode, dst, src0, src1));
   } else {
      emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1), src1));
      inst = emit(fs_inst(opcode, dst, src0, reg_null_f));

      inst->base_mrf = base_mrf;
      inst->mlen = 2;
   }
   return inst;
}

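/* Allocates storage for a variable, emitting input interpolation or
 * uniform setup as appropriate, and records the storage in the
 * variable hash table.
 */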
void
fs_visitor::visit(ir_variable *ir)
{
   fs_reg *reg = NULL;

   if (variable_storage(ir))
      return;

   if (strcmp(ir->name, "gl_FragColor") == 0) {
      this->frag_color = ir;
   } else if (strcmp(ir->name, "gl_FragData") == 0) {
      this->frag_data = ir;
   } else if (strcmp(ir->name, "gl_FragDepth") == 0) {
      this->frag_depth = ir;
   }

   if (ir->mode == ir_var_in) {
      if (!strcmp(ir->name, "gl_FragCoord")) {
	 reg = emit_fragcoord_interpolation(ir);
      } else if (!strcmp(ir->name, "gl_FrontFacing")) {
	 reg = emit_frontfacing_interpolation(ir);
      } else {
	 reg = emit_general_interpolation(ir);
      }
      assert(reg);
      hash_table_insert(this->variable_ht, reg, ir);
      return;
   }

   if (ir->mode == ir_var_uniform) {
      int param_index = c->prog_data.nr_params;

      if (!strncmp(ir->name, "gl_", 3)) {
	 setup_builtin_uniform_values(ir);
      } else {
	 setup_uniform_values(ir->location, ir->type);
      }

      reg = new(this->mem_ctx) fs_reg(UNIFORM, param_index);
      reg->type = brw_type_for_base_type(ir->type);
   }

   if (!reg)
      reg = new(this->mem_ctx) fs_reg(this, ir->type);

   hash_table_insert(this->variable_ht, reg, ir);
}

void
fs_visitor::visit(ir_dereference_variable *ir)
{
   fs_reg *reg = variable_storage(ir->var);
   this->result = *reg;
}

void
fs_visitor::visit(ir_dereference_record *ir)
{
   const glsl_type *struct_type = ir->record->type;

   ir->record->accept(this);

   unsigned int offset = 0;
   for (unsigned int i = 0; i < struct_type->length; i++) {
      if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
	 break;
      offset += type_size(struct_type->fields.structure[i].type);
   }
   this->result.reg_offset += offset;
   this->result.type = brw_type_for_base_type(ir->type);
}

void
fs_visitor::visit(ir_dereference_array *ir)
{
   ir_constant *index;
   int element_size;

   ir->array->accept(this);
   index = ir->array_index->as_constant();

   element_size = type_size(ir->type);
   this->result.type = brw_type_for_base_type(ir->type);

   if (index) {
      assert(this->result.file == UNIFORM ||
	     (this->result.file == GRF &&
	      this->result.reg != 0));
      this->result.reg_offset += index->value.i[0] * element_size;
   } else {
      assert(!"FINISHME: non-constant array element");
   }
}

/* Instruction selection: Produce a MOV.sat instead of
 * MIN(MAX(val, 0), 1) when possible.
 */
bool
fs_visitor::try_emit_saturate(ir_expression *ir)
{
   ir_rvalue *sat_val = ir->as_rvalue_to_saturate();

   if (!sat_val)
      return false;

   sat_val->accept(this);
   fs_reg src = this->result;

   this->result = fs_reg(this, ir->type);
   fs_inst *inst = emit(fs_inst(BRW_OPCODE_MOV, this->result, src));
   inst->saturate = true;

   return true;
}

static uint32_t
brw_conditional_for_comparison(unsigned int op)
{
   switch (op) {
   case ir_binop_less:
      return BRW_CONDITIONAL_L;
   case ir_binop_greater:
      return BRW_CONDITIONAL_G;
   case ir_binop_lequal:
      return BRW_CONDITIONAL_LE;
   case ir_binop_gequal:
      return BRW_CONDITIONAL_GE;
   case ir_binop_equal:
   case ir_binop_all_equal: /* same as equal for scalars */
      return BRW_CONDITIONAL_Z;
   case ir_binop_nequal:
   case ir_binop_any_nequal: /* same as nequal for scalars */
      return BRW_CONDITIONAL_NZ;
   default:
      assert(!"not reached: bad operation for comparison");
      return BRW_CONDITIONAL_NZ;
   }
}

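/* Emits code for an expression tree.  By this point the channel-expression
 * and vector-splitting passes have reduced everything to scalars, so at
 * most two scalar operands need to be handled here.
 */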
void
fs_visitor::visit(ir_expression *ir)
{
   unsigned int operand;
   fs_reg op[2], temp;
   fs_inst *inst;

   assert(ir->get_num_operands() <= 2);

   if (try_emit_saturate(ir))
      return;

   for (operand = 0; operand < ir->get_num_operands(); operand++) {
      ir->operands[operand]->accept(this);
      if (this->result.file == BAD_FILE) {
	 ir_print_visitor v;
	 printf("Failed to get tree for expression operand:\n");
	 ir->operands[operand]->accept(&v);
	 this->fail = true;
      }
      op[operand] = this->result;

      /* Matrix expression operands should have been broken down to vector
       * operations already.
       */
      assert(!ir->operands[operand]->type->is_matrix());
      /* And then those vector operands should have been broken down to scalar.
       */
      assert(!ir->operands[operand]->type->is_vector());
   }

   /* Storage for our result.  If our result goes into an assignment, it will
    * just get copy-propagated out, so no worries.
    */
   this->result = fs_reg(this, ir->type);

   switch (ir->operation) {
   case ir_unop_logic_not:
      /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
       * ones complement of the whole register, not just bit 0.
       */
      emit(fs_inst(BRW_OPCODE_XOR, this->result, op[0], fs_reg(1)));
      break;
   case ir_unop_neg:
      op[0].negate = !op[0].negate;
      this->result = op[0];
      break;
   case ir_unop_abs:
      op[0].abs = true;
      this->result = op[0];
      break;
   case ir_unop_sign:
      temp = fs_reg(this, ir->type);

      emit(fs_inst(BRW_OPCODE_MOV, this->result, fs_reg(0.0f)));

      inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null_f, op[0], fs_reg(0.0f)));
      inst->conditional_mod = BRW_CONDITIONAL_G;
      inst = emit(fs_inst(BRW_OPCODE_MOV, this->result, fs_reg(1.0f)));
      inst->predicated = true;

      inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null_f, op[0], fs_reg(0.0f)));
      inst->conditional_mod = BRW_CONDITIONAL_L;
      inst = emit(fs_inst(BRW_OPCODE_MOV, this->result, fs_reg(-1.0f)));
      inst->predicated = true;

      break;
   case ir_unop_rcp:
      emit_math(FS_OPCODE_RCP, this->result, op[0]);
      break;

   case ir_unop_exp2:
      emit_math(FS_OPCODE_EXP2, this->result, op[0]);
      break;
   case ir_unop_log2:
      emit_math(FS_OPCODE_LOG2, this->result, op[0]);
      break;
   case ir_unop_exp:
   case ir_unop_log:
      assert(!"not reached: should be handled by ir_explog_to_explog2");
      break;
   case ir_unop_sin:
   case ir_unop_sin_reduced:
      emit_math(FS_OPCODE_SIN, this->result, op[0]);
      break;
   case ir_unop_cos:
   case ir_unop_cos_reduced:
      emit_math(FS_OPCODE_COS, this->result, op[0]);
      break;

   case ir_unop_dFdx:
      emit(fs_inst(FS_OPCODE_DDX, this->result, op[0]));
      break;
   case ir_unop_dFdy:
      emit(fs_inst(FS_OPCODE_DDY, this->result, op[0]));
      break;

   case ir_binop_add:
      emit(fs_inst(BRW_OPCODE_ADD, this->result, op[0], op[1]));
      break;
   case ir_binop_sub:
      assert(!"not reached: should be handled by ir_sub_to_add_neg");
      break;

   case ir_binop_mul:
      emit(fs_inst(BRW_OPCODE_MUL, this->result, op[0], op[1]));
      break;
   case ir_binop_div:
      assert(!"not reached: should be handled by ir_div_to_mul_rcp");
      break;
   case ir_binop_mod:
      assert(!"ir_binop_mod should have been converted to b * fract(a/b)");
      break;

   case ir_binop_less:
   case ir_binop_greater:
   case ir_binop_lequal:
   case ir_binop_gequal:
   case ir_binop_equal:
   case ir_binop_all_equal:
   case ir_binop_nequal:
   case ir_binop_any_nequal:
      temp = this->result;
      /* original gen4 does implicit conversion before comparison. */
      if (intel->gen < 5)
	 temp.type = op[0].type;

      inst = emit(fs_inst(BRW_OPCODE_CMP, temp, op[0], op[1]));
      inst->conditional_mod = brw_conditional_for_comparison(ir->operation);
      emit(fs_inst(BRW_OPCODE_AND, this->result, this->result, fs_reg(0x1)));
      break;

   case ir_binop_logic_xor:
      emit(fs_inst(BRW_OPCODE_XOR, this->result, op[0], op[1]));
      break;

   case ir_binop_logic_or:
      emit(fs_inst(BRW_OPCODE_OR, this->result, op[0], op[1]));
      break;

   case ir_binop_logic_and:
      emit(fs_inst(BRW_OPCODE_AND, this->result, op[0], op[1]));
      break;

   case ir_binop_dot:
   case ir_unop_any:
      assert(!"not reached: should be handled by brw_fs_channel_expressions");
      break;

   case ir_unop_noise:
      assert(!"not reached: should be handled by lower_noise");
      break;

   case ir_quadop_vector:
      assert(!"not reached: should be handled by lower_quadop_vector");
      break;

   case ir_unop_sqrt:
      emit_math(FS_OPCODE_SQRT, this->result, op[0]);
      break;

   case ir_unop_rsq:
      emit_math(FS_OPCODE_RSQ, this->result, op[0]);
      break;

   case ir_unop_i2f:
   case ir_unop_b2f:
   case ir_unop_b2i:
   case ir_unop_f2i:
      emit(fs_inst(BRW_OPCODE_MOV, this->result, op[0]));
      break;
   case ir_unop_f2b:
   case ir_unop_i2b:
      temp = this->result;
      /* original gen4 does implicit conversion before comparison. */
      if (intel->gen < 5)
	 temp.type = op[0].type;

      inst = emit(fs_inst(BRW_OPCODE_CMP, temp, op[0], fs_reg(0.0f)));
      inst->conditional_mod = BRW_CONDITIONAL_NZ;
      inst = emit(fs_inst(BRW_OPCODE_AND, this->result,
			  this->result, fs_reg(1)));
      break;

   case ir_unop_trunc:
      emit(fs_inst(BRW_OPCODE_RNDZ, this->result, op[0]));
      break;
   case ir_unop_ceil:
      op[0].negate = !op[0].negate;
      inst = emit(fs_inst(BRW_OPCODE_RNDD, this->result, op[0]));
      this->result.negate = true;
      break;
   case ir_unop_floor:
      inst = emit(fs_inst(BRW_OPCODE_RNDD, this->result, op[0]));
      break;
   case ir_unop_fract:
      inst = emit(fs_inst(BRW_OPCODE_FRC, this->result, op[0]));
      break;
   case ir_unop_round_even:
      emit(fs_inst(BRW_OPCODE_RNDE, this->result, op[0]));
      break;

   case ir_binop_min:
      inst = emit(fs_inst(BRW_OPCODE_CMP, this->result, op[0], op[1]));
      inst->conditional_mod = BRW_CONDITIONAL_L;

      inst = emit(fs_inst(BRW_OPCODE_SEL, this->result, op[0], op[1]));
      inst->predicated = true;
      break;
   case ir_binop_max:
      inst = emit(fs_inst(BRW_OPCODE_CMP, this->result, op[0], op[1]));
      inst->conditional_mod = BRW_CONDITIONAL_G;

      inst = emit(fs_inst(BRW_OPCODE_SEL, this->result, op[0], op[1]));
      inst->predicated = true;
      break;

   case ir_binop_pow:
      emit_math(FS_OPCODE_POW, this->result, op[0], op[1]);
      break;

   case ir_unop_bit_not:
      inst = emit(fs_inst(BRW_OPCODE_NOT, this->result, op[0]));
      break;
   case ir_binop_bit_and:
      inst = emit(fs_inst(BRW_OPCODE_AND, this->result, op[0], op[1]));
      break;
   case ir_binop_bit_xor:
      inst = emit(fs_inst(BRW_OPCODE_XOR, this->result, op[0], op[1]));
      break;
   case ir_binop_bit_or:
      inst = emit(fs_inst(BRW_OPCODE_OR, this->result, op[0], op[1]));
      break;

   case ir_unop_u2f:
   case ir_binop_lshift:
   case ir_binop_rshift:
      assert(!"GLSL 1.30 features unsupported");
      break;
   }
}

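/* Recursively emits the per-component MOVs for an assignment of an
 * aggregate (array, struct, or vector) value, optionally predicated on
 * the condition the caller has already set up.
 */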
void
fs_visitor::emit_assignment_writes(fs_reg &l, fs_reg &r,
				   const glsl_type *type, bool predicated)
{
   switch (type->base_type) {
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_BOOL:
      for (unsigned int i = 0; i < type->components(); i++) {
	 l.type = brw_type_for_base_type(type);
	 r.type = brw_type_for_base_type(type);

	 fs_inst *inst = emit(fs_inst(BRW_OPCODE_MOV, l, r));
	 inst->predicated = predicated;

	 l.reg_offset++;
	 r.reg_offset++;
      }
      break;
   case GLSL_TYPE_ARRAY:
      for (unsigned int i = 0; i < type->length; i++) {
	 emit_assignment_writes(l, r, type->fields.array, predicated);
      }
      break;

   case GLSL_TYPE_STRUCT:
      for (unsigned int i = 0; i < type->length; i++) {
	 emit_assignment_writes(l, r, type->fields.structure[i].type,
				predicated);
      }
      break;

   case GLSL_TYPE_SAMPLER:
      break;

   default:
      assert(!"not reached");
      break;
   }
}

void
fs_visitor::visit(ir_assignment *ir)
{
   struct fs_reg l, r;
   fs_inst *inst;

   /* FINISHME: arrays on the lhs */
   ir->lhs->accept(this);
   l = this->result;

   ir->rhs->accept(this);
   r = this->result;

   assert(l.file != BAD_FILE);
   assert(r.file != BAD_FILE);

   if (ir->condition) {
      emit_bool_to_cond_code(ir->condition);
   }

   if (ir->lhs->type->is_scalar() ||
       ir->lhs->type->is_vector()) {
      for (int i = 0; i < ir->lhs->type->vector_elements; i++) {
	 if (ir->write_mask & (1 << i)) {
	    inst = emit(fs_inst(BRW_OPCODE_MOV, l, r));
	    if (ir->condition)
	       inst->predicated = true;
	    r.reg_offset++;
	 }
	 l.reg_offset++;
      }
   } else {
      emit_assignment_writes(l, r, ir->lhs->type, ir->condition != NULL);
   }
}

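/* Assembles the gen4 sampler message for a texture operation.  The
 * non-shadow bias/LOD variants only exist as SIMD16 messages on gen4,
 * so in that case the payload is laid out SIMD16 and the interleaved
 * return values are copied back into SIMD8 layout afterward.
 */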
fs_inst *
fs_visitor::emit_texture_gen4(ir_texture *ir, fs_reg dst, fs_reg coordinate)
{
   int mlen;
   int base_mrf = 1;
   bool simd16 = false;
   fs_reg orig_dst;

   /* g0 header. */
   mlen = 1;

   if (ir->shadow_comparitor) {
      for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
	 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i),
		      coordinate));
	 coordinate.reg_offset++;
      }
      /* gen4's SIMD8 sampler always has the slots for u,v,r present. */
      mlen += 3;

      if (ir->op == ir_tex) {
	 /* There's no plain shadow compare message, so we use shadow
	  * compare with a bias of 0.0.
	  */
	 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen),
		      fs_reg(0.0f)));
	 mlen++;
      } else if (ir->op == ir_txb) {
	 ir->lod_info.bias->accept(this);
	 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen),
		      this->result));
	 mlen++;
      } else {
	 assert(ir->op == ir_txl);
	 ir->lod_info.lod->accept(this);
	 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen),
		      this->result));
	 mlen++;
      }

      ir->shadow_comparitor->accept(this);
      emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result));
      mlen++;
   } else if (ir->op == ir_tex) {
      for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
	 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i),
		      coordinate));
	 coordinate.reg_offset++;
      }
      /* gen4's SIMD8 sampler always has the slots for u,v,r present. */
      mlen += 3;
   } else {
      /* Oh joy.  gen4 doesn't have SIMD8 non-shadow-compare bias/lod
       * instructions.  We'll need to do SIMD16 here.
       */
      assert(ir->op == ir_txb || ir->op == ir_txl);

      for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
	 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i * 2),
		      coordinate));
	 coordinate.reg_offset++;
      }

      /* lod/bias appears after u/v/r. */
      mlen += 6;

      if (ir->op == ir_txb) {
	 ir->lod_info.bias->accept(this);
	 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen),
		      this->result));
	 mlen++;
      } else {
	 ir->lod_info.lod->accept(this);
	 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen),
		      this->result));
	 mlen++;
      }

      /* The unused upper half. */
      mlen++;

      /* Now, since we're doing simd16, the return is 2 interleaved
       * vec4s where the odd-indexed ones are junk. We'll need to move
       * this weirdness around to the expected layout.
       */
      simd16 = true;
      orig_dst = dst;
      dst = fs_reg(this, glsl_type::get_array_instance(glsl_type::vec4_type,
						       2));
      dst.type = BRW_REGISTER_TYPE_F;
   }

   fs_inst *inst = NULL;
   switch (ir->op) {
   case ir_tex:
      inst = emit(fs_inst(FS_OPCODE_TEX, dst));
      break;
   case ir_txb:
      inst = emit(fs_inst(FS_OPCODE_TXB, dst));
      break;
   case ir_txl:
      inst = emit(fs_inst(FS_OPCODE_TXL, dst));
      break;
   case ir_txd:
   case ir_txf:
      assert(!"GLSL 1.30 features unsupported");
      break;
   }
   inst->base_mrf = base_mrf;
   inst->mlen = mlen;

   if (simd16) {
      for (int i = 0; i < 4; i++) {
	 emit(fs_inst(BRW_OPCODE_MOV, orig_dst, dst));
	 orig_dst.reg_offset++;
	 dst.reg_offset += 2;
      }
   }

   return inst;
}

fs_inst *
fs_visitor::emit_texture_gen5(ir_texture *ir, fs_reg dst, fs_reg coordinate)
{
   /* gen5's SIMD8 sampler has slots for u, v, r, array index, then
    * optional parameters like the shadow comparator or LOD bias.  If
    * the optional parameters aren't present, the base slots they would
    * occupy don't need to be included in the message.
    *
    * We don't fill in the unnecessary slots regardless, which may
    * look surprising in the disassembly.
    */
   int mlen = 1; /* g0 header always present. */
   int base_mrf = 1;

   for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
      emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i),
		   coordinate));
      coordinate.reg_offset++;
   }
   mlen += ir->coordinate->type->vector_elements;

   if (ir->shadow_comparitor) {
      mlen = MAX2(mlen, 5);

      ir->shadow_comparitor->accept(this);
      emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result));
      mlen++;
   }

   fs_inst *inst = NULL;
   switch (ir->op) {
   case ir_tex:
      inst = emit(fs_inst(FS_OPCODE_TEX, dst));
      break;
   case ir_txb:
      ir->lod_info.bias->accept(this);
      mlen = MAX2(mlen, 5);
      emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result));
      mlen++;

      inst = emit(fs_inst(FS_OPCODE_TXB, dst));
      break;
   case ir_txl:
      ir->lod_info.lod->accept(this);
      mlen = MAX2(mlen, 5);
      emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result));
      mlen++;

      inst = emit(fs_inst(FS_OPCODE_TXL, dst));
      break;
   case ir_txd:
   case ir_txf:
      assert(!"GLSL 1.30 features unsupported");
      break;
   }
   inst->base_mrf = base_mrf;
   inst->mlen = mlen;

   return inst;
}

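/* Emits a texture sample: looks up the sampler unit, applies the
 * rectangle-texture coordinate scaling that the 965 requires the EU to
 * do, selects the gen4 or gen5+ message encoding, and applies any
 * texture swizzle from the program key.
 */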
void
fs_visitor::visit(ir_texture *ir)
{
   int sampler;
   fs_inst *inst = NULL;

   ir->coordinate->accept(this);
   fs_reg coordinate = this->result;

   /* Should be lowered by do_lower_texture_projection */
   assert(!ir->projector);

   sampler = _mesa_get_sampler_uniform_value(ir->sampler,
					     ctx->Shader.CurrentFragmentProgram,
					     &brw->fragment_program->Base);
   sampler = c->fp->program.Base.SamplerUnits[sampler];

   /* The 965 requires the EU to do the normalization of GL rectangle
    * texture coordinates.  We use the program parameter state
    * tracking to get the scaling factor.
    */
   if (ir->sampler->type->sampler_dimensionality == GLSL_SAMPLER_DIM_RECT) {
      struct gl_program_parameter_list *params = c->fp->program.Base.Parameters;
      int tokens[STATE_LENGTH] = {
	 STATE_INTERNAL,
	 STATE_TEXRECT_SCALE,
	 sampler,
	 0,
	 0
      };

      c->prog_data.param_convert[c->prog_data.nr_params] =
	 PARAM_NO_CONVERT;
      c->prog_data.param_convert[c->prog_data.nr_params + 1] =
	 PARAM_NO_CONVERT;

      fs_reg scale_x = fs_reg(UNIFORM, c->prog_data.nr_params);
      fs_reg scale_y = fs_reg(UNIFORM, c->prog_data.nr_params + 1);
      GLuint index = _mesa_add_state_reference(params,
					       (gl_state_index *)tokens);
      float *vec_values = this->fp->Base.Parameters->ParameterValues[index];

      c->prog_data.param[c->prog_data.nr_params++] = &vec_values[0];
      c->prog_data.param[c->prog_data.nr_params++] = &vec_values[1];

      fs_reg dst = fs_reg(this, ir->coordinate->type);
      fs_reg src = coordinate;
      coordinate = dst;

      emit(fs_inst(BRW_OPCODE_MUL, dst, src, scale_x));
      dst.reg_offset++;
      src.reg_offset++;
      emit(fs_inst(BRW_OPCODE_MUL, dst, src, scale_y));
   }

   /* Writemasking doesn't eliminate channels on SIMD8 texture
    * samples, so don't worry about them.
    */
   fs_reg dst = fs_reg(this, glsl_type::vec4_type);

   if (intel->gen < 5) {
      inst = emit_texture_gen4(ir, dst, coordinate);
   } else {
      inst = emit_texture_gen5(ir, dst, coordinate);
   }

   inst->sampler = sampler;

   this->result = dst;

   if (ir->shadow_comparitor)
      inst->shadow_compare = true;

   if (c->key.tex_swizzles[inst->sampler] != SWIZZLE_NOOP) {
      fs_reg swizzle_dst = fs_reg(this, glsl_type::vec4_type);

      for (int i = 0; i < 4; i++) {
	 int swiz = GET_SWZ(c->key.tex_swizzles[inst->sampler], i);
	 fs_reg l = swizzle_dst;
	 l.reg_offset += i;

	 if (swiz == SWIZZLE_ZERO) {
	    emit(fs_inst(BRW_OPCODE_MOV, l, fs_reg(0.0f)));
	 } else if (swiz == SWIZZLE_ONE) {
	    emit(fs_inst(BRW_OPCODE_MOV, l, fs_reg(1.0f)));
	 } else {
	    fs_reg r = dst;
	    r.reg_offset += GET_SWZ(c->key.tex_swizzles[inst->sampler], i);
	    emit(fs_inst(BRW_OPCODE_MOV, l, r));
	 }
      }
      this->result = swizzle_dst;
   }
}

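/* Emits a swizzle as a sequence of per-channel MOVs from the source's
 * register offsets, or just adjusts reg_offset for a single-channel
 * swizzle.
 */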
void
fs_visitor::visit(ir_swizzle *ir)
{
   ir->val->accept(this);
   fs_reg val = this->result;

   if (ir->type->vector_elements == 1) {
      this->result.reg_offset += ir->mask.x;
      return;
   }

   fs_reg result = fs_reg(this, ir->type);
   this->result = result;

   for (unsigned int i = 0; i < ir->type->vector_elements; i++) {
      fs_reg channel = val;
      int swiz = 0;

      switch (i) {
      case 0:
	 swiz = ir->mask.x;
	 break;
      case 1:
	 swiz = ir->mask.y;
	 break;
      case 2:
	 swiz = ir->mask.z;
	 break;
      case 3:
	 swiz = ir->mask.w;
	 break;
      }

      channel.reg_offset += swiz;
      emit(fs_inst(BRW_OPCODE_MOV, result, channel));
      result.reg_offset++;
   }
}

void
fs_visitor::visit(ir_discard *ir)
{
   fs_reg temp = fs_reg(this, glsl_type::uint_type);

   assert(ir->condition == NULL); /* FINISHME */

   emit(fs_inst(FS_OPCODE_DISCARD_NOT, temp, reg_null_d));
   emit(fs_inst(FS_OPCODE_DISCARD_AND, reg_null_d, temp));
   kill_emitted = true;
}

void
fs_visitor::visit(ir_constant *ir)
{
   /* Set this->result to reg at the bottom of the function because some code
    * paths will cause this visitor to be applied to other fields.  This will
    * cause the value stored in this->result to be modified.
    *
    * Make reg constant so that it doesn't get accidentally modified along the
    * way.  Yes, I actually had this problem. :(
    */
   const fs_reg reg(this, ir->type);
   fs_reg dst_reg = reg;

   if (ir->type->is_array()) {
      const unsigned size = type_size(ir->type->fields.array);

      for (unsigned i = 0; i < ir->type->length; i++) {
	 ir->array_elements[i]->accept(this);
	 fs_reg src_reg = this->result;

	 dst_reg.type = src_reg.type;
	 for (unsigned j = 0; j < size; j++) {
	    emit(fs_inst(BRW_OPCODE_MOV, dst_reg, src_reg));
	    src_reg.reg_offset++;
	    dst_reg.reg_offset++;
	 }
      }
   } else if (ir->type->is_record()) {
      foreach_list(node, &ir->components) {
	 ir_instruction *const field = (ir_instruction *) node;
	 const unsigned size = type_size(field->type);

	 field->accept(this);
	 fs_reg src_reg = this->result;

	 dst_reg.type = src_reg.type;
	 for (unsigned j = 0; j < size; j++) {
	    emit(fs_inst(BRW_OPCODE_MOV, dst_reg, src_reg));
	    src_reg.reg_offset++;
	    dst_reg.reg_offset++;
	 }
      }
   } else {
      const unsigned size = type_size(ir->type);

      for (unsigned i = 0; i < size; i++) {
	 switch (ir->type->base_type) {
	 case GLSL_TYPE_FLOAT:
	    emit(fs_inst(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.f[i])));
	    break;
	 case GLSL_TYPE_UINT:
	    emit(fs_inst(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.u[i])));
	    break;
	 case GLSL_TYPE_INT:
	    emit(fs_inst(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.i[i])));
	    break;
	 case GLSL_TYPE_BOOL:
	    emit(fs_inst(BRW_OPCODE_MOV, dst_reg, fs_reg((int)ir->value.b[i])));
	    break;
	 default:
	    assert(!"Non-float/uint/int/bool constant");
	 }
	 dst_reg.reg_offset++;
      }
   }

   this->result = reg;
}

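/* Emits instructions that leave the flag register set from a boolean
 * rvalue, folding comparisons directly into the conditional mod where
 * possible instead of materializing a boolean value.
 */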
void
fs_visitor::emit_bool_to_cond_code(ir_rvalue *ir)
{
   ir_expression *expr = ir->as_expression();

   if (expr) {
      fs_reg op[2];
      fs_inst *inst;

      assert(expr->get_num_operands() <= 2);
      for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
	 assert(expr->operands[i]->type->is_scalar());

	 expr->operands[i]->accept(this);
	 op[i] = this->result;
      }

      switch (expr->operation) {
      case ir_unop_logic_not:
	 inst = emit(fs_inst(BRW_OPCODE_AND, reg_null_d, op[0], fs_reg(1)));
	 inst->conditional_mod = BRW_CONDITIONAL_Z;
	 break;

      case ir_binop_logic_xor:
	 inst = emit(fs_inst(BRW_OPCODE_XOR, reg_null_d, op[0], op[1]));
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 break;

      case ir_binop_logic_or:
	 inst = emit(fs_inst(BRW_OPCODE_OR, reg_null_d, op[0], op[1]));
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 break;

      case ir_binop_logic_and:
	 inst = emit(fs_inst(BRW_OPCODE_AND, reg_null_d, op[0], op[1]));
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 break;

      case ir_unop_f2b:
	 if (intel->gen >= 6) {
	    inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null_d,
				op[0], fs_reg(0.0f)));
	 } else {
	    inst = emit(fs_inst(BRW_OPCODE_MOV, reg_null_f, op[0]));
	 }
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 break;

      case ir_unop_i2b:
	 if (intel->gen >= 6) {
	    inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null_d, op[0], fs_reg(0)));
	 } else {
	    inst = emit(fs_inst(BRW_OPCODE_MOV, reg_null_d, op[0]));
	 }
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 break;

      case ir_binop_greater:
      case ir_binop_gequal:
      case ir_binop_less:
      case ir_binop_lequal:
      case ir_binop_equal:
      case ir_binop_all_equal:
      case ir_binop_nequal:
      case ir_binop_any_nequal:
	 inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null_cmp, op[0], op[1]));
	 inst->conditional_mod =
	    brw_conditional_for_comparison(expr->operation);
	 break;

      default:
	 assert(!"not reached");
	 this->fail = true;
	 break;
      }
      return;
   }

   ir->accept(this);

   if (intel->gen >= 6) {
      fs_inst *inst = emit(fs_inst(BRW_OPCODE_AND, reg_null_d,
				   this->result, fs_reg(1)));
      inst->conditional_mod = BRW_CONDITIONAL_NZ;
   } else {
      fs_inst *inst = emit(fs_inst(BRW_OPCODE_MOV, reg_null_d, this->result));
      inst->conditional_mod = BRW_CONDITIONAL_NZ;
   }
}

/**
 * Emit a gen6 IF statement with the comparison folded into the IF
 * instruction.
 */
void
fs_visitor::emit_if_gen6(ir_if *ir)
{
   ir_expression *expr = ir->condition->as_expression();

   if (expr) {
      fs_reg op[2];
      fs_inst *inst;
      fs_reg temp;

      assert(expr->get_num_operands() <= 2);
      for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
	 assert(expr->operands[i]->type->is_scalar());

	 expr->operands[i]->accept(this);
	 op[i] = this->result;
      }

      switch (expr->operation) {
      case ir_unop_logic_not:
	 /* Use the null register as the destination like the other cases;
	  * temp is not initialized at this point.
	  */
	 inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_d, op[0], fs_reg(0)));
	 inst->conditional_mod = BRW_CONDITIONAL_Z;
	 return;

      case ir_binop_logic_xor:
	 inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_d, op[0], op[1]));
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 return;

      case ir_binop_logic_or:
	 temp = fs_reg(this, glsl_type::bool_type);
	 emit(fs_inst(BRW_OPCODE_OR, temp, op[0], op[1]));
	 inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_d, temp, fs_reg(0)));
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 return;

      case ir_binop_logic_and:
	 temp = fs_reg(this, glsl_type::bool_type);
	 emit(fs_inst(BRW_OPCODE_AND, temp, op[0], op[1]));
	 inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_d, temp, fs_reg(0)));
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 return;

      case ir_unop_f2b:
	 inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_f, op[0], fs_reg(0)));
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 return;

      case ir_unop_i2b:
	 inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_d, op[0], fs_reg(0)));
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 return;

      case ir_binop_greater:
      case ir_binop_gequal:
      case ir_binop_less:
      case ir_binop_lequal:
      case ir_binop_equal:
      case ir_binop_all_equal:
      case ir_binop_nequal:
      case ir_binop_any_nequal:
	 inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_d, op[0], op[1]));
	 inst->conditional_mod =
	    brw_conditional_for_comparison(expr->operation);
	 return;
      default:
	 assert(!"not reached");
	 inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_d, op[0], fs_reg(0)));
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 this->fail = true;
	 return;
      }
      return;
   }

   ir->condition->accept(this);

   fs_inst *inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_d,
				this->result, fs_reg(0)));
   inst->conditional_mod = BRW_CONDITIONAL_NZ;
}

void
fs_visitor::visit(ir_if *ir)
{
   fs_inst *inst;

   /* Don't point the annotation at the if statement, because then it plus
    * the then and else blocks get printed.
    */
   this->base_ir = ir->condition;

   if (intel->gen >= 6) {
      emit_if_gen6(ir);
   } else {
      emit_bool_to_cond_code(ir->condition);

      inst = emit(fs_inst(BRW_OPCODE_IF));
      inst->predicated = true;
   }

   foreach_iter(exec_list_iterator, iter, ir->then_instructions) {
      ir_instruction *ir = (ir_instruction *)iter.get();
      this->base_ir = ir;

      ir->accept(this);
   }

   if (!ir->else_instructions.is_empty()) {
      emit(fs_inst(BRW_OPCODE_ELSE));

      foreach_iter(exec_list_iterator, iter, ir->else_instructions) {
	 ir_instruction *ir = (ir_instruction *)iter.get();
	 this->base_ir = ir;

	 ir->accept(this);
      }
   }

   emit(fs_inst(BRW_OPCODE_ENDIF));
}

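/* Emits a loop as a DO ... WHILE block, with the optional counter
 * initialization, bound test (a predicated BREAK), and increment that
 * loop analysis may have attached to the ir_loop.
 */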
void
fs_visitor::visit(ir_loop *ir)
{
   fs_reg counter = reg_undef;

   if (ir->counter) {
      this->base_ir = ir->counter;
      ir->counter->accept(this);
      counter = *(variable_storage(ir->counter));

      if (ir->from) {
	 this->base_ir = ir->from;
	 ir->from->accept(this);

	 emit(fs_inst(BRW_OPCODE_MOV, counter, this->result));
      }
   }

   emit(fs_inst(BRW_OPCODE_DO));

   if (ir->to) {
      this->base_ir = ir->to;
      ir->to->accept(this);

      fs_inst *inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null_cmp,
				   counter, this->result));
      inst->conditional_mod = brw_conditional_for_comparison(ir->cmp);

      inst = emit(fs_inst(BRW_OPCODE_BREAK));
      inst->predicated = true;
   }

   foreach_iter(exec_list_iterator, iter, ir->body_instructions) {
      ir_instruction *ir = (ir_instruction *)iter.get();

      this->base_ir = ir;
      ir->accept(this);
   }

   if (ir->increment) {
      this->base_ir = ir->increment;
      ir->increment->accept(this);
      emit(fs_inst(BRW_OPCODE_ADD, counter, counter, this->result));
   }

   emit(fs_inst(BRW_OPCODE_WHILE));
}

void
fs_visitor::visit(ir_loop_jump *ir)
{
   switch (ir->mode) {
   case ir_loop_jump::jump_break:
      emit(fs_inst(BRW_OPCODE_BREAK));
      break;
   case ir_loop_jump::jump_continue:
      emit(fs_inst(BRW_OPCODE_CONTINUE));
      break;
   }
}

void
fs_visitor::visit(ir_call *ir)
{
   assert(!"FINISHME");
}

void
fs_visitor::visit(ir_return *ir)
{
   assert(!"FINISHME");
}

void
fs_visitor::visit(ir_function *ir)
{
   /* Ignore function bodies other than main() -- we shouldn't see calls to
    * them since they should all be inlined before we get to ir_to_mesa.
    */
   if (strcmp(ir->name, "main") == 0) {
      const ir_function_signature *sig;
      exec_list empty;

      sig = ir->matching_signature(&empty);

      assert(sig);

      foreach_iter(exec_list_iterator, iter, sig->body) {
	 ir_instruction *ir = (ir_instruction *)iter.get();
	 this->base_ir = ir;

	 ir->accept(this);
      }
   }
}

void
fs_visitor::visit(ir_function_signature *ir)
{
   assert(!"not reached");
   (void)ir;
}

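/* Appends a copy of the given instruction to the instruction stream,
 * tagging it with the current annotation and source IR for debug output.
 */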
1849fs_inst *
1850fs_visitor::emit(fs_inst inst)
1851{
1852   fs_inst *list_inst = new(mem_ctx) fs_inst;
1853   *list_inst = inst;
1854
1855   list_inst->annotation = this->current_annotation;
1856   list_inst->ir = this->base_ir;
1857
1858   this->instructions.push_tail(list_inst);
1859
1860   return list_inst;
1861}
1862
1863/** Emits a dummy fragment shader consisting of magenta for bringup purposes. */
1864void
1865fs_visitor::emit_dummy_fs()
1866{
1867   /* Everyone's favorite color. */
1868   emit(fs_inst(BRW_OPCODE_MOV,
1869		fs_reg(MRF, 2),
1870		fs_reg(1.0f)));
1871   emit(fs_inst(BRW_OPCODE_MOV,
1872		fs_reg(MRF, 3),
1873		fs_reg(0.0f)));
1874   emit(fs_inst(BRW_OPCODE_MOV,
1875		fs_reg(MRF, 4),
1876		fs_reg(1.0f)));
1877   emit(fs_inst(BRW_OPCODE_MOV,
1878		fs_reg(MRF, 5),
1879		fs_reg(0.0f)));
1880
1881   fs_inst *write;
1882   write = emit(fs_inst(FS_OPCODE_FB_WRITE,
1883			fs_reg(0),
1884			fs_reg(0)));
1885   write->base_mrf = 0;
1886}
1887
1888/* The register location here is relative to the start of the URB
1889 * data.  It will get adjusted to be a real location before
1890 * generate_code() time.
1891 */
1892struct brw_reg
1893fs_visitor::interp_reg(int location, int channel)
1894{
1895   int regnr = urb_setup[location] * 2 + channel / 2;
1896   int stride = (channel & 1) * 4;
1897
1898   assert(urb_setup[location] != -1);
1899
1900   return brw_vec1_grf(regnr, stride);
1901}
1902
1903/** Emits the interpolation for the varying inputs. */
1904void
1905fs_visitor::emit_interpolation_setup_gen4()
1906{
1907   struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
1908
1909   this->current_annotation = "compute pixel centers";
1910   this->pixel_x = fs_reg(this, glsl_type::uint_type);
1911   this->pixel_y = fs_reg(this, glsl_type::uint_type);
1912   this->pixel_x.type = BRW_REGISTER_TYPE_UW;
1913   this->pixel_y.type = BRW_REGISTER_TYPE_UW;
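   /* g1 of the PS thread payload holds the X/Y origins of the 2x2
    * subspans.  The <2;4,0> regions below replicate each subspan origin
    * four times, and the brw_imm_v immediates are packed vectors of
    * eight 4-bit per-pixel offsets (+0 +1 +0 +1 ... in X, +0 +0 +1 +1
    * ... in Y), so a single ADD produces each pixel coordinate.
    */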
1914   emit(fs_inst(BRW_OPCODE_ADD,
1915		this->pixel_x,
1916		fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
1917		fs_reg(brw_imm_v(0x10101010))));
1918   emit(fs_inst(BRW_OPCODE_ADD,
1919		this->pixel_y,
1920		fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
1921		fs_reg(brw_imm_v(0x11001100))));
1922
1923   this->current_annotation = "compute pixel deltas from v0";
1924   if (brw->has_pln) {
1925      this->delta_x = fs_reg(this, glsl_type::vec2_type);
1926      this->delta_y = this->delta_x;
1927      this->delta_y.reg_offset++;
1928   } else {
1929      this->delta_x = fs_reg(this, glsl_type::float_type);
1930      this->delta_y = fs_reg(this, glsl_type::float_type);
1931   }
1932   emit(fs_inst(BRW_OPCODE_ADD,
1933		this->delta_x,
1934		this->pixel_x,
1935		fs_reg(negate(brw_vec1_grf(1, 0)))));
1936   emit(fs_inst(BRW_OPCODE_ADD,
1937		this->delta_y,
1938		this->pixel_y,
1939		fs_reg(negate(brw_vec1_grf(1, 1)))));
1940
1941   this->current_annotation = "compute pos.w and 1/pos.w";
1942   /* Compute wpos.w.  It's always in our setup, since it's needed to
1943    * interpolate the other attributes.
1944    */
1945   this->wpos_w = fs_reg(this, glsl_type::float_type);
1946   emit(fs_inst(FS_OPCODE_LINTERP, wpos_w, this->delta_x, this->delta_y,
1947		interp_reg(FRAG_ATTRIB_WPOS, 3)));
1948   /* Compute the pixel 1/W value from wpos.w. */
1949   this->pixel_w = fs_reg(this, glsl_type::float_type);
1950   emit_math(FS_OPCODE_RCP, this->pixel_w, wpos_w);
1951   this->current_annotation = NULL;
1952}
1953
1954/** Emits the interpolation for the varying inputs. */
1955void
1956fs_visitor::emit_interpolation_setup_gen6()
1957{
1958   struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
1959
1960   /* If the pixel centers end up used, the setup is the same as for gen4. */
1961   this->current_annotation = "compute pixel centers";
1962   fs_reg int_pixel_x = fs_reg(this, glsl_type::uint_type);
1963   fs_reg int_pixel_y = fs_reg(this, glsl_type::uint_type);
1964   int_pixel_x.type = BRW_REGISTER_TYPE_UW;
1965   int_pixel_y.type = BRW_REGISTER_TYPE_UW;
1966   emit(fs_inst(BRW_OPCODE_ADD,
1967		int_pixel_x,
1968		fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
1969		fs_reg(brw_imm_v(0x10101010))));
1970   emit(fs_inst(BRW_OPCODE_ADD,
1971		int_pixel_y,
1972		fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
1973		fs_reg(brw_imm_v(0x11001100))));
1974
1975   /* As of gen6, we can no longer mix float and int sources.  We have
1976    * to turn the integer pixel centers into floats for their actual
1977    * use.
1978    */
1979   this->pixel_x = fs_reg(this, glsl_type::float_type);
1980   this->pixel_y = fs_reg(this, glsl_type::float_type);
1981   emit(fs_inst(BRW_OPCODE_MOV, this->pixel_x, int_pixel_x));
1982   emit(fs_inst(BRW_OPCODE_MOV, this->pixel_y, int_pixel_y));
1983
1984   this->current_annotation = "compute 1/pos.w";
1985   this->wpos_w = fs_reg(brw_vec8_grf(c->source_w_reg, 0));
1986   this->pixel_w = fs_reg(this, glsl_type::float_type);
1987   emit_math(FS_OPCODE_RCP, this->pixel_w, wpos_w);
1988
1989   this->delta_x = fs_reg(brw_vec8_grf(2, 0));
1990   this->delta_y = fs_reg(brw_vec8_grf(3, 0));
1991
1992   this->current_annotation = NULL;
1993}
1994
1995void
1996fs_visitor::emit_fb_writes()
1997{
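   /* A sketch of the SIMD8 message payload assembled below.  The MRF
    * numbers are illustrative, since each optional piece shifts the
    * ones after it:
    *
    *    m0..m1  header (when header_present)
    *    m2      antialias destination stencil alpha (when set up)
    *    m2..m5  one reg each of color R, G, B, A
    *    m6      source depth (when handed to the render target)
    *    m7      destination depth (when set up)
    */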
1998   this->current_annotation = "FB write header";
1999   GLboolean header_present = GL_TRUE;
2000   int nr = 0;
2001
2002   if (intel->gen >= 6 &&
2003       !this->kill_emitted &&
2004       c->key.nr_color_regions == 1) {
2005      header_present = false;
2006   }
2007
2008   if (header_present) {
2009      /* m0, m1 header */
2010      nr += 2;
2011   }
2012
2013   if (c->aa_dest_stencil_reg) {
2014      emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
2015		   fs_reg(brw_vec8_grf(c->aa_dest_stencil_reg, 0))));
2016   }
2017
2018   /* Reserve space for color. It'll be filled in per MRT below. */
2019   int color_mrf = nr;
2020   nr += 4;
2021
2022   if (c->source_depth_to_render_target) {
2023      if (c->computes_depth) {
2024	 /* Hand over gl_FragDepth. */
2025	 assert(this->frag_depth);
2026	 fs_reg depth = *(variable_storage(this->frag_depth));
2027
2028	 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, nr++), depth));
2029      } else {
2030	 /* Pass through the payload depth. */
2031	 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
2032		      fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
2033      }
2034   }
2035
2036   if (c->dest_depth_reg) {
2037      emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
2038		   fs_reg(brw_vec8_grf(c->dest_depth_reg, 0))));
2039   }
2040
2041   fs_reg color = reg_undef;
2042   if (this->frag_color)
2043      color = *(variable_storage(this->frag_color));
2044   else if (this->frag_data) {
2045      color = *(variable_storage(this->frag_data));
2046      color.type = BRW_REGISTER_TYPE_F;
2047   }
2048
2049   for (int target = 0; target < c->key.nr_color_regions; target++) {
2050      this->current_annotation = talloc_asprintf(this->mem_ctx,
2051						 "FB write target %d",
2052						 target);
2053      if (this->frag_color || this->frag_data) {
2054	 for (int i = 0; i < 4; i++) {
2055	    emit(fs_inst(BRW_OPCODE_MOV,
2056			 fs_reg(MRF, color_mrf + i),
2057			 color));
2058	    color.reg_offset++;
2059	 }
2060      }
2061
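      /* gl_FragColor is broadcast to every render target, so rewind to
       * reuse it; gl_FragData keeps advancing through the array.
       */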
2062      if (this->frag_color)
2063	 color.reg_offset -= 4;
2064
2065      fs_inst *inst = emit(fs_inst(FS_OPCODE_FB_WRITE,
2066				   reg_undef, reg_undef));
2067      inst->target = target;
2068      inst->base_mrf = 0;
2069      inst->mlen = nr;
2070      if (target == c->key.nr_color_regions - 1)
2071	 inst->eot = true;
2072      inst->header_present = header_present;
2073   }
2074
2075   if (c->key.nr_color_regions == 0) {
2076      fs_inst *inst = emit(fs_inst(FS_OPCODE_FB_WRITE,
2077				   reg_undef, reg_undef));
2078      inst->base_mrf = 0;
2079      inst->mlen = nr;
2080      inst->eot = true;
2081      inst->header_present = header_present;
2082   }
2083
2084   this->current_annotation = NULL;
2085}
2086
2087void
2088fs_visitor::generate_fb_write(fs_inst *inst)
2089{
2090   GLboolean eot = inst->eot;
2091   struct brw_reg implied_header;
2092
   /* The header is 2 regs whose contents come from g0 and g1.  On
    * pre-gen6, g0 is supplied by the SEND's implied move and only g1 is
    * copied explicitly; on gen6+ both copies are emitted below.
    */
2096   brw_push_insn_state(p);
2097   brw_set_mask_control(p, BRW_MASK_DISABLE);
2098   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2099
2100   if (inst->header_present) {
2101      if (intel->gen >= 6) {
2102	 brw_MOV(p,
2103		 brw_message_reg(inst->base_mrf),
2104		 brw_vec8_grf(0, 0));
2105
2106	 if (inst->target > 0) {
2107	    /* Set the render target index for choosing BLEND_STATE. */
2108	    brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, 0, 2),
2109			      BRW_REGISTER_TYPE_UD),
2110		    brw_imm_ud(inst->target));
2111	 }
2112
2113	 /* Clear viewport index, render target array index. */
2114	 brw_AND(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, 0, 0),
2115			   BRW_REGISTER_TYPE_UD),
2116		 retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD),
2117		 brw_imm_ud(0xf7ff));
2118
2119	 implied_header = brw_null_reg();
2120      } else {
2121	 implied_header = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
2122      }
2123
2124      brw_MOV(p,
2125	      brw_message_reg(inst->base_mrf + 1),
2126	      brw_vec8_grf(1, 0));
2127   } else {
2128      implied_header = brw_null_reg();
2129   }
2130
2131   brw_pop_insn_state(p);
2132
2133   brw_fb_WRITE(p,
2134		8, /* dispatch_width */
2135		retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW),
2136		inst->base_mrf,
2137		implied_header,
2138		inst->target,
2139		inst->mlen,
2140		0,
2141		eot,
2142		inst->header_present);
2143}
2144
2145void
2146fs_visitor::generate_linterp(fs_inst *inst,
2147			     struct brw_reg dst, struct brw_reg *src)
2148{
2149   struct brw_reg delta_x = src[0];
2150   struct brw_reg delta_y = src[1];
2151   struct brw_reg interp = src[2];
2152
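   /* Either path evaluates the attribute's plane equation at the
    * pixel: roughly a0 + a_x * delta_x + a_y * delta_y.  PLN does it in
    * one instruction but needs the deltas in a properly aligned
    * register pair; the fallback folds in the delta_x term with LINE
    * and accumulates the delta_y term with MAC.
    */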
2153   if (brw->has_pln &&
2154       delta_y.nr == delta_x.nr + 1 &&
2155       (intel->gen >= 6 || (delta_x.nr & 1) == 0)) {
2156      brw_PLN(p, dst, interp, delta_x);
2157   } else {
2158      brw_LINE(p, brw_null_reg(), interp, delta_x);
2159      brw_MAC(p, dst, suboffset(interp, 1), delta_y);
2160   }
2161}
2162
2163void
2164fs_visitor::generate_math(fs_inst *inst,
2165			  struct brw_reg dst, struct brw_reg *src)
2166{
2167   int op;
2168
2169   switch (inst->opcode) {
2170   case FS_OPCODE_RCP:
2171      op = BRW_MATH_FUNCTION_INV;
2172      break;
2173   case FS_OPCODE_RSQ:
2174      op = BRW_MATH_FUNCTION_RSQ;
2175      break;
2176   case FS_OPCODE_SQRT:
2177      op = BRW_MATH_FUNCTION_SQRT;
2178      break;
2179   case FS_OPCODE_EXP2:
2180      op = BRW_MATH_FUNCTION_EXP;
2181      break;
2182   case FS_OPCODE_LOG2:
2183      op = BRW_MATH_FUNCTION_LOG;
2184      break;
2185   case FS_OPCODE_POW:
2186      op = BRW_MATH_FUNCTION_POW;
2187      break;
2188   case FS_OPCODE_SIN:
2189      op = BRW_MATH_FUNCTION_SIN;
2190      break;
2191   case FS_OPCODE_COS:
2192      op = BRW_MATH_FUNCTION_COS;
2193      break;
2194   default:
2195      assert(!"not reached: unknown math function");
2196      op = 0;
2197      break;
2198   }
2199
2200   if (intel->gen >= 6) {
2201      assert(inst->mlen == 0);
2202
2203      if (inst->opcode == FS_OPCODE_POW) {
2204	 brw_math2(p, dst, op, src[0], src[1]);
2205      } else {
2206	 brw_math(p, dst,
2207		  op,
2208		  inst->saturate ? BRW_MATH_SATURATE_SATURATE :
2209		  BRW_MATH_SATURATE_NONE,
2210		  0, src[0],
2211		  BRW_MATH_DATA_VECTOR,
2212		  BRW_MATH_PRECISION_FULL);
2213      }
2214   } else {
2215      assert(inst->mlen >= 1);
2216
2217      brw_math(p, dst,
2218	       op,
2219	       inst->saturate ? BRW_MATH_SATURATE_SATURATE :
2220	       BRW_MATH_SATURATE_NONE,
2221	       inst->base_mrf, src[0],
2222	       BRW_MATH_DATA_VECTOR,
2223	       BRW_MATH_PRECISION_FULL);
2224   }
2225}
2226
2227void
2228fs_visitor::generate_tex(fs_inst *inst, struct brw_reg dst)
2229{
2230   int msg_type = -1;
2231   int rlen = 4;
2232   uint32_t simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
2233
2234   if (intel->gen >= 5) {
2235      switch (inst->opcode) {
2236      case FS_OPCODE_TEX:
2237	 if (inst->shadow_compare) {
2238	    msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_COMPARE_GEN5;
2239	 } else {
2240	    msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_GEN5;
2241	 }
2242	 break;
2243      case FS_OPCODE_TXB:
2244	 if (inst->shadow_compare) {
2245	    msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE_GEN5;
2246	 } else {
2247	    msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_BIAS_GEN5;
2248	 }
2249	 break;
2250      }
2251   } else {
2252      switch (inst->opcode) {
2253      case FS_OPCODE_TEX:
	 /* Note that G45 and older determine shadow compare and dispatch width
2255	  * from message length for most messages.
2256	  */
2257	 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
2258	 if (inst->shadow_compare) {
2259	    assert(inst->mlen == 6);
2260	 } else {
2261	    assert(inst->mlen <= 4);
2262	 }
2263	 break;
2264      case FS_OPCODE_TXB:
2265	 if (inst->shadow_compare) {
2266	    assert(inst->mlen == 6);
2267	    msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
2268	 } else {
2269	    assert(inst->mlen == 9);
2270	    msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
2271	    simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
2272	 }
2273	 break;
2274      }
2275   }
2276   assert(msg_type != -1);
2277
2278   if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) {
2279      rlen = 8;
2280      dst = vec16(dst);
2281   }
2282
2283   brw_SAMPLE(p,
2284	      retype(dst, BRW_REGISTER_TYPE_UW),
2285	      inst->base_mrf,
2286	      retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
2287              SURF_INDEX_TEXTURE(inst->sampler),
2288	      inst->sampler,
2289	      WRITEMASK_XYZW,
2290	      msg_type,
2291	      rlen,
2292	      inst->mlen,
2293	      0,
2294	      1,
2295	      simd_mode);
2296}
2297
2298
2299/* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
2300 * looking like:
2301 *
2302 * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
2303 *
2304 * and we're trying to produce:
2305 *
2306 *           DDX                     DDY
2307 * dst: (ss0.tr - ss0.tl)     (ss0.tl - ss0.bl)
2308 *      (ss0.tr - ss0.tl)     (ss0.tr - ss0.br)
2309 *      (ss0.br - ss0.bl)     (ss0.tl - ss0.bl)
2310 *      (ss0.br - ss0.bl)     (ss0.tr - ss0.br)
2311 *      (ss1.tr - ss1.tl)     (ss1.tl - ss1.bl)
2312 *      (ss1.tr - ss1.tl)     (ss1.tr - ss1.br)
2313 *      (ss1.br - ss1.bl)     (ss1.tl - ss1.bl)
2314 *      (ss1.br - ss1.bl)     (ss1.tr - ss1.br)
2315 *
2316 * and add another set of two more subspans if in 16-pixel dispatch mode.
2317 *
2318 * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
2319 * for each pair, and vertstride = 2 jumps us 2 elements after processing a
2320 * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
2321 * between each other.  We could probably do it like ddx and swizzle the right
2322 * order later, but bail for now and just produce
2323 * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
2324 */
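/* In <vstride;width,hstride> region notation, generate_ddx() below
 * reads src0 as a <2;2,0> region at sub-register 1 (tr tr br br ...)
 * and src1 as a <2;2,0> region at sub-register 0 (tl tl bl bl ...), so
 * one ADD with src1 negated produces the DDX column above.
 */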
2325void
2326fs_visitor::generate_ddx(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
2327{
2328   struct brw_reg src0 = brw_reg(src.file, src.nr, 1,
2329				 BRW_REGISTER_TYPE_F,
2330				 BRW_VERTICAL_STRIDE_2,
2331				 BRW_WIDTH_2,
2332				 BRW_HORIZONTAL_STRIDE_0,
2333				 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
2334   struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
2335				 BRW_REGISTER_TYPE_F,
2336				 BRW_VERTICAL_STRIDE_2,
2337				 BRW_WIDTH_2,
2338				 BRW_HORIZONTAL_STRIDE_0,
2339				 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
2340   brw_ADD(p, dst, src0, negate(src1));
2341}
2342
2343void
2344fs_visitor::generate_ddy(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
2345{
2346   struct brw_reg src0 = brw_reg(src.file, src.nr, 0,
2347				 BRW_REGISTER_TYPE_F,
2348				 BRW_VERTICAL_STRIDE_4,
2349				 BRW_WIDTH_4,
2350				 BRW_HORIZONTAL_STRIDE_0,
2351				 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
2352   struct brw_reg src1 = brw_reg(src.file, src.nr, 2,
2353				 BRW_REGISTER_TYPE_F,
2354				 BRW_VERTICAL_STRIDE_4,
2355				 BRW_WIDTH_4,
2356				 BRW_HORIZONTAL_STRIDE_0,
2357				 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
2358   brw_ADD(p, dst, src0, negate(src1));
2359}
2360
2361void
2362fs_visitor::generate_discard_not(fs_inst *inst, struct brw_reg mask)
2363{
2364   if (intel->gen >= 6) {
2365      /* Gen6 no longer has the mask reg for us to just read the
2366       * active channels from.  However, cmp updates just the channels
2367       * of the flag reg that are enabled, so we can get at the
2368       * channel enables that way.  In this step, make a reg of ones
2369       * we'll compare to.
2370       */
2371      brw_MOV(p, mask, brw_imm_ud(1));
2372   } else {
2373      brw_push_insn_state(p);
2374      brw_set_mask_control(p, BRW_MASK_DISABLE);
2375      brw_NOT(p, mask, brw_mask_reg(1)); /* IMASK */
2376      brw_pop_insn_state(p);
2377   }
2378}
2379
2380void
2381fs_visitor::generate_discard_and(fs_inst *inst, struct brw_reg mask)
2382{
2383   if (intel->gen >= 6) {
2384      struct brw_reg f0 = brw_flag_reg();
2385      struct brw_reg g1 = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
2386
2387      brw_push_insn_state(p);
2388      brw_set_mask_control(p, BRW_MASK_DISABLE);
2389      brw_MOV(p, f0, brw_imm_uw(0xffff)); /* inactive channels undiscarded */
2390      brw_pop_insn_state(p);
2391
2392      brw_CMP(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UD),
2393	      BRW_CONDITIONAL_Z, mask, brw_imm_ud(0)); /* active channels fail test */
      /* Undo CMP's whacking of predication. */
2395      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2396
2397      brw_push_insn_state(p);
2398      brw_set_mask_control(p, BRW_MASK_DISABLE);
2399      brw_AND(p, g1, f0, g1);
2400      brw_pop_insn_state(p);
2401   } else {
2402      struct brw_reg g0 = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
2403
2404      mask = brw_uw1_reg(mask.file, mask.nr, 0);
2405
2406      brw_push_insn_state(p);
2407      brw_set_mask_control(p, BRW_MASK_DISABLE);
2408      brw_AND(p, g0, mask, g0);
2409      brw_pop_insn_state(p);
2410   }
2411}
2412
2413void
2414fs_visitor::generate_spill(fs_inst *inst, struct brw_reg src)
2415{
2416   assert(inst->mlen != 0);
2417
2418   brw_MOV(p,
2419	   retype(brw_message_reg(inst->base_mrf + 1), BRW_REGISTER_TYPE_UD),
2420	   retype(src, BRW_REGISTER_TYPE_UD));
2421   brw_oword_block_write_scratch(p, brw_message_reg(inst->base_mrf), 1,
2422				 inst->offset);
2423}
2424
2425void
2426fs_visitor::generate_unspill(fs_inst *inst, struct brw_reg dst)
2427{
2428   assert(inst->mlen != 0);
2429
2430   /* Clear any post destination dependencies that would be ignored by
2431    * the block read.  See the B-Spec for pre-gen5 send instruction.
2432    *
2433    * This could use a better solution, since texture sampling and
2434    * math reads could potentially run into it as well -- anywhere
2435    * that we have a SEND with a destination that is a register that
2436    * was written but not read within the last N instructions (what's
2437    * N?  unsure).  This is rare because of dead code elimination, but
2438    * not impossible.
2439    */
2440   if (intel->gen == 4 && !intel->is_g4x)
2441      brw_MOV(p, brw_null_reg(), dst);
2442
2443   brw_oword_block_read_scratch(p, dst, brw_message_reg(inst->base_mrf), 1,
2444				inst->offset);
2445
2446   if (intel->gen == 4 && !intel->is_g4x) {
2447      /* gen4 errata: destination from a send can't be used as a
2448       * destination until it's been read.  Just read it so we don't
2449       * have to worry.
2450       */
2451      brw_MOV(p, brw_null_reg(), dst);
2452   }
2453}
2454
2455
2456void
2457fs_visitor::generate_pull_constant_load(fs_inst *inst, struct brw_reg dst)
2458{
2459   assert(inst->mlen != 0);
2460
2461   /* Clear any post destination dependencies that would be ignored by
2462    * the block read.  See the B-Spec for pre-gen5 send instruction.
2463    *
2464    * This could use a better solution, since texture sampling and
2465    * math reads could potentially run into it as well -- anywhere
2466    * that we have a SEND with a destination that is a register that
2467    * was written but not read within the last N instructions (what's
2468    * N?  unsure).  This is rare because of dead code elimination, but
2469    * not impossible.
2470    */
2471   if (intel->gen == 4 && !intel->is_g4x)
2472      brw_MOV(p, brw_null_reg(), dst);
2473
2474   brw_oword_block_read(p, dst, brw_message_reg(inst->base_mrf),
2475			inst->offset, SURF_INDEX_FRAG_CONST_BUFFER);
2476
2477   if (intel->gen == 4 && !intel->is_g4x) {
2478      /* gen4 errata: destination from a send can't be used as a
2479       * destination until it's been read.  Just read it so we don't
2480       * have to worry.
2481       */
2482      brw_MOV(p, brw_null_reg(), dst);
2483   }
2484}
2485
2486void
2487fs_visitor::assign_curb_setup()
2488{
2489   c->prog_data.first_curbe_grf = c->nr_payload_regs;
2490   c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
2491
2492   /* Map the offsets in the UNIFORM file to fixed HW regs. */
2493   foreach_iter(exec_list_iterator, iter, this->instructions) {
2494      fs_inst *inst = (fs_inst *)iter.get();
2495
2496      for (unsigned int i = 0; i < 3; i++) {
2497	 if (inst->src[i].file == UNIFORM) {
2498	    int constant_nr = inst->src[i].hw_reg + inst->src[i].reg_offset;
2499	    struct brw_reg brw_reg = brw_vec1_grf(c->prog_data.first_curbe_grf +
2500						  constant_nr / 8,
2501						  constant_nr % 8);
2502
2503	    inst->src[i].file = FIXED_HW_REG;
2504	    inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
2505	 }
2506      }
2507   }
2508}
2509
2510void
2511fs_visitor::calculate_urb_setup()
2512{
2513   for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
2514      urb_setup[i] = -1;
2515   }
2516
2517   int urb_next = 0;
2518   /* Figure out where each of the incoming setup attributes lands. */
2519   if (intel->gen >= 6) {
2520      for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
2521	 if (brw->fragment_program->Base.InputsRead & BITFIELD64_BIT(i)) {
2522	    urb_setup[i] = urb_next++;
2523	 }
2524      }
2525   } else {
2526      /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
2527      for (unsigned int i = 0; i < VERT_RESULT_MAX; i++) {
2528	 if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
2529	    int fp_index;
2530
2531	    if (i >= VERT_RESULT_VAR0)
2532	       fp_index = i - (VERT_RESULT_VAR0 - FRAG_ATTRIB_VAR0);
2533	    else if (i <= VERT_RESULT_TEX7)
2534	       fp_index = i;
2535	    else
2536	       fp_index = -1;
2537
2538	    if (fp_index >= 0)
2539	       urb_setup[fp_index] = urb_next++;
2540	 }
2541      }
2542   }
2543
2544   /* Each attribute is 4 setup channels, each of which is half a reg. */
2545   c->prog_data.urb_read_length = urb_next * 2;
2546}
2547
2548void
2549fs_visitor::assign_urb_setup()
2550{
2551   int urb_start = c->prog_data.first_curbe_grf + c->prog_data.curb_read_length;
2552
   /* Offset all the urb_setup[] indices by the actual position of the
2554    * setup regs, now that the location of the constants has been chosen.
2555    */
2556   foreach_iter(exec_list_iterator, iter, this->instructions) {
2557      fs_inst *inst = (fs_inst *)iter.get();
2558
2559      if (inst->opcode != FS_OPCODE_LINTERP)
2560	 continue;
2561
2562      assert(inst->src[2].file == FIXED_HW_REG);
2563
2564      inst->src[2].fixed_hw_reg.nr += urb_start;
2565   }
2566
2567   this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
2568}
2569
2570/**
2571 * Split large virtual GRFs into separate components if we can.
2572 *
2573 * This is mostly duplicated with what brw_fs_vector_splitting does,
2574 * but that's really conservative because it's afraid of doing
2575 * splitting that doesn't result in real progress after the rest of
2576 * the optimization phases, which would cause infinite looping in
2577 * optimization.  We can do it once here, safely.  This also has the
2578 * opportunity to split interpolated values, or maybe even uniforms,
2579 * which we don't have at the IR level.
2580 *
2581 * We want to split, because virtual GRFs are what we register
2582 * allocate and spill (due to contiguousness requirements for some
2583 * instructions), and they're what we naturally generate in the
2584 * codegen process, but most virtual GRFs don't actually need to be
2585 * contiguous sets of GRFs.  If we split, we'll end up with reduced
2586 * live intervals and better dead code elimination and coalescing.
2587 */
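/* For instance, a size-4 virtual GRF holding a vec4 temporary becomes
 * four independent size-1 virtual GRFs, one per component, unless a
 * texture result or a PLN operand forces it to stay contiguous.
 */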
2588void
2589fs_visitor::split_virtual_grfs()
2590{
2591   int num_vars = this->virtual_grf_next;
2592   bool split_grf[num_vars];
2593   int new_virtual_grf[num_vars];
2594
   /* Try to split anything larger than one register. */
2596   for (int i = 0; i < num_vars; i++) {
2597      if (this->virtual_grf_sizes[i] != 1)
2598	 split_grf[i] = true;
2599      else
2600	 split_grf[i] = false;
2601   }
2602
2603   if (brw->has_pln) {
2604      /* PLN opcodes rely on the delta_xy being contiguous. */
2605      split_grf[this->delta_x.reg] = false;
2606   }
2607
2608   foreach_iter(exec_list_iterator, iter, this->instructions) {
2609      fs_inst *inst = (fs_inst *)iter.get();
2610
2611      /* Texturing produces 4 contiguous registers, so no splitting. */
2612      if ((inst->opcode == FS_OPCODE_TEX ||
2613	   inst->opcode == FS_OPCODE_TXB ||
2614	   inst->opcode == FS_OPCODE_TXL) &&
2615	  inst->dst.file == GRF) {
2616	 split_grf[inst->dst.reg] = false;
2617      }
2618   }
2619
2620   /* Allocate new space for split regs.  Note that the virtual
2621    * numbers will be contiguous.
2622    */
2623   for (int i = 0; i < num_vars; i++) {
2624      if (split_grf[i]) {
2625	 new_virtual_grf[i] = virtual_grf_alloc(1);
2626	 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
2627	    int reg = virtual_grf_alloc(1);
2628	    assert(reg == new_virtual_grf[i] + j - 1);
2629	    (void) reg;
2630	 }
2631	 this->virtual_grf_sizes[i] = 1;
2632      }
2633   }
2634
2635   foreach_iter(exec_list_iterator, iter, this->instructions) {
2636      fs_inst *inst = (fs_inst *)iter.get();
2637
2638      if (inst->dst.file == GRF &&
2639	  split_grf[inst->dst.reg] &&
2640	  inst->dst.reg_offset != 0) {
2641	 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
2642			  inst->dst.reg_offset - 1);
2643	 inst->dst.reg_offset = 0;
2644      }
2645      for (int i = 0; i < 3; i++) {
2646	 if (inst->src[i].file == GRF &&
2647	     split_grf[inst->src[i].reg] &&
2648	     inst->src[i].reg_offset != 0) {
2649	    inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
2650				inst->src[i].reg_offset - 1);
2651	    inst->src[i].reg_offset = 0;
2652	 }
2653      }
2654   }
2655}
2656
2657/**
2658 * Choose accesses from the UNIFORM file to demote to using the pull
2659 * constant buffer.
2660 *
2661 * We allow a fragment shader to have more than the specified minimum
2662 * maximum number of fragment shader uniform components (64).  If
 * there are too many of these, they would fill up all of the register
 * space.  So, this pushes some of them out to the pull constant buffer
 * and updates the program to load them from there.
2666 */
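/* Illustrative numbers: with the 128-component push limit below, a use
 * of uniform component 130 is rewritten into an
 * FS_OPCODE_PULL_CONSTANT_LOAD from the 16-byte-aligned offset
 * containing it, with smear selecting component (130 - 128) & 3 = 2 of
 * the loaded register.
 */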
2667void
2668fs_visitor::setup_pull_constants()
2669{
2670   /* Only allow 16 registers (128 uniform components) as push constants. */
2671   unsigned int max_uniform_components = 16 * 8;
2672   if (c->prog_data.nr_params <= max_uniform_components)
2673      return;
2674
2675   /* Just demote the end of the list.  We could probably do better
2676    * here, demoting things that are rarely used in the program first.
2677    */
2678   int pull_uniform_base = max_uniform_components;
2679   int pull_uniform_count = c->prog_data.nr_params - pull_uniform_base;
2680
2681   foreach_iter(exec_list_iterator, iter, this->instructions) {
2682      fs_inst *inst = (fs_inst *)iter.get();
2683
2684      for (int i = 0; i < 3; i++) {
2685	 if (inst->src[i].file != UNIFORM)
2686	    continue;
2687
2688	 int uniform_nr = inst->src[i].hw_reg + inst->src[i].reg_offset;
2689	 if (uniform_nr < pull_uniform_base)
2690	    continue;
2691
2692	 fs_reg dst = fs_reg(this, glsl_type::float_type);
2693	 fs_inst *pull = new(mem_ctx) fs_inst(FS_OPCODE_PULL_CONSTANT_LOAD,
2694					      dst);
2695	 pull->offset = ((uniform_nr - pull_uniform_base) * 4) & ~15;
2696	 pull->ir = inst->ir;
2697	 pull->annotation = inst->annotation;
2698	 pull->base_mrf = 14;
2699	 pull->mlen = 1;
2700
2701	 inst->insert_before(pull);
2702
2703	 inst->src[i].file = GRF;
2704	 inst->src[i].reg = dst.reg;
2705	 inst->src[i].reg_offset = 0;
2706	 inst->src[i].smear = (uniform_nr - pull_uniform_base) & 3;
2707      }
2708   }
2709
2710   for (int i = 0; i < pull_uniform_count; i++) {
2711      c->prog_data.pull_param[i] = c->prog_data.param[pull_uniform_base + i];
2712      c->prog_data.pull_param_convert[i] =
2713	 c->prog_data.param_convert[pull_uniform_base + i];
2714   }
2715   c->prog_data.nr_params -= pull_uniform_count;
2716   c->prog_data.nr_pull_params = pull_uniform_count;
2717}
2718
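/**
 * Computes a conservative [def, use] instruction-IP interval for each
 * virtual GRF.
 *
 * Since this is not a real dataflow analysis, registers touched inside
 * a loop are generally treated as live across the whole loop; the one
 * carve-out below is a single-register GRF whose def in the same basic
 * block covers the use.
 */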
2719void
2720fs_visitor::calculate_live_intervals()
2721{
2722   int num_vars = this->virtual_grf_next;
2723   int *def = talloc_array(mem_ctx, int, num_vars);
2724   int *use = talloc_array(mem_ctx, int, num_vars);
2725   int loop_depth = 0;
2726   int loop_start = 0;
2727   int bb_header_ip = 0;
2728
2729   for (int i = 0; i < num_vars; i++) {
2730      def[i] = 1 << 30;
2731      use[i] = -1;
2732   }
2733
2734   int ip = 0;
2735   foreach_iter(exec_list_iterator, iter, this->instructions) {
2736      fs_inst *inst = (fs_inst *)iter.get();
2737
2738      if (inst->opcode == BRW_OPCODE_DO) {
2739	 if (loop_depth++ == 0)
2740	    loop_start = ip;
2741      } else if (inst->opcode == BRW_OPCODE_WHILE) {
2742	 loop_depth--;
2743
2744	 if (loop_depth == 0) {
	    /* Patch up the uses of vars marked as live across the
	     * whole loop.
	     */
2748	    for (int i = 0; i < num_vars; i++) {
2749	       if (use[i] == loop_start) {
2750		  use[i] = ip;
2751	       }
2752	    }
2753	 }
2754      } else {
2755	 for (unsigned int i = 0; i < 3; i++) {
2756	    if (inst->src[i].file == GRF && inst->src[i].reg != 0) {
2757	       int reg = inst->src[i].reg;
2758
2759	       if (!loop_depth || (this->virtual_grf_sizes[reg] == 1 &&
2760				   def[reg] >= bb_header_ip)) {
2761		  use[reg] = ip;
2762	       } else {
2763		  def[reg] = MIN2(loop_start, def[reg]);
2764		  use[reg] = loop_start;
2765
2766		  /* Nobody else is going to go smash our start to
2767		   * later in the loop now, because def[reg] now
2768		   * points before the bb header.
2769		   */
2770	       }
2771	    }
2772	 }
2773	 if (inst->dst.file == GRF && inst->dst.reg != 0) {
2774	    int reg = inst->dst.reg;
2775
2776	    if (!loop_depth || (this->virtual_grf_sizes[reg] == 1 &&
2777				!inst->predicated)) {
2778	       def[reg] = MIN2(def[reg], ip);
2779	    } else {
2780	       def[reg] = MIN2(def[reg], loop_start);
2781	    }
2782	 }
2783      }
2784
2785      ip++;
2786
      /* Set the basic block header IP.  This is used for determining
       * whether a complete def of a single-register virtual GRF in a
       * loop dominates a use in the same basic block.  It's a quick
       * way to reduce the live interval range of most registers used
       * in a loop.
       */
2793      if (inst->opcode == BRW_OPCODE_IF ||
2794	  inst->opcode == BRW_OPCODE_ELSE ||
2795	  inst->opcode == BRW_OPCODE_ENDIF ||
2796	  inst->opcode == BRW_OPCODE_DO ||
2797	  inst->opcode == BRW_OPCODE_WHILE ||
2798	  inst->opcode == BRW_OPCODE_BREAK ||
2799	  inst->opcode == BRW_OPCODE_CONTINUE) {
2800	 bb_header_ip = ip;
2801      }
2802   }
2803
2804   talloc_free(this->virtual_grf_def);
2805   talloc_free(this->virtual_grf_use);
2806   this->virtual_grf_def = def;
2807   this->virtual_grf_use = use;
2808}
2809
2810/**
2811 * Attempts to move immediate constants into the immediate
2812 * constant slot of following instructions.
2813 *
 * Immediate constants are a bit tricky -- they have to be in the last
 * operand slot, and you can't do abs/negate on them.
2816 */
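/* An illustrative sketch (register numbers hypothetical):
 *
 *    MOV vgrf4, 2.0f
 *    ADD vgrf5, vgrf3, vgrf4
 *
 * becomes
 *
 *    MOV vgrf4, 2.0f
 *    ADD vgrf5, vgrf3, 2.0f
 *
 * leaving the now-dead MOV for dead_code_eliminate() to clean up.
 */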
2817
2818bool
2819fs_visitor::propagate_constants()
2820{
2821   bool progress = false;
2822
2823   foreach_iter(exec_list_iterator, iter, this->instructions) {
2824      fs_inst *inst = (fs_inst *)iter.get();
2825
2826      if (inst->opcode != BRW_OPCODE_MOV ||
2827	  inst->predicated ||
2828	  inst->dst.file != GRF || inst->src[0].file != IMM ||
2829	  inst->dst.type != inst->src[0].type)
2830	 continue;
2831
2832      /* Don't bother with cases where we should have had the
2833       * operation on the constant folded in GLSL already.
2834       */
2835      if (inst->saturate)
2836	 continue;
2837
2838      /* Found a move of a constant to a GRF.  Find anything else using the GRF
2839       * before it's written, and replace it with the constant if we can.
2840       */
2841      exec_list_iterator scan_iter = iter;
2842      scan_iter.next();
2843      for (; scan_iter.has_next(); scan_iter.next()) {
2844	 fs_inst *scan_inst = (fs_inst *)scan_iter.get();
2845
2846	 if (scan_inst->opcode == BRW_OPCODE_DO ||
2847	     scan_inst->opcode == BRW_OPCODE_WHILE ||
2848	     scan_inst->opcode == BRW_OPCODE_ELSE ||
2849	     scan_inst->opcode == BRW_OPCODE_ENDIF) {
2850	    break;
2851	 }
2852
2853	 for (int i = 2; i >= 0; i--) {
2854	    if (scan_inst->src[i].file != GRF ||
2855		scan_inst->src[i].reg != inst->dst.reg ||
2856		scan_inst->src[i].reg_offset != inst->dst.reg_offset)
2857	       continue;
2858
2859	    /* Don't bother with cases where we should have had the
2860	     * operation on the constant folded in GLSL already.
2861	     */
2862	    if (scan_inst->src[i].negate || scan_inst->src[i].abs)
2863	       continue;
2864
2865	    switch (scan_inst->opcode) {
2866	    case BRW_OPCODE_MOV:
2867	       scan_inst->src[i] = inst->src[0];
2868	       progress = true;
2869	       break;
2870
2871	    case BRW_OPCODE_MUL:
2872	    case BRW_OPCODE_ADD:
2873	       if (i == 1) {
2874		  scan_inst->src[i] = inst->src[0];
2875		  progress = true;
	       } else if (i == 0 && scan_inst->src[1].file != IMM) {
		  /* Fit this constant in by commuting the operands. */
		  scan_inst->src[0] = scan_inst->src[1];
		  scan_inst->src[1] = inst->src[0];
		  progress = true;
	       }
2881	       break;
2882	    case BRW_OPCODE_CMP:
2883	    case BRW_OPCODE_SEL:
2884	       if (i == 1) {
2885		  scan_inst->src[i] = inst->src[0];
2886		  progress = true;
	       }
	       break;
	    }
2889	 }
2890
2891	 if (scan_inst->dst.file == GRF &&
2892	     scan_inst->dst.reg == inst->dst.reg &&
2893	     (scan_inst->dst.reg_offset == inst->dst.reg_offset ||
2894	      scan_inst->opcode == FS_OPCODE_TEX)) {
2895	    break;
2896	 }
2897      }
2898   }
2899
2900   return progress;
2901}

/**
 * Must be called after calculate_live_intervals() to remove unused
 * writes to registers -- register allocation will fail otherwise
 * because something defined but never used won't be considered to
 * interfere with other regs.
 */
2908bool
2909fs_visitor::dead_code_eliminate()
2910{
2911   bool progress = false;
2912   int pc = 0;
2913
2914   foreach_iter(exec_list_iterator, iter, this->instructions) {
2915      fs_inst *inst = (fs_inst *)iter.get();
2916
2917      if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
2918	 inst->remove();
2919	 progress = true;
2920      }
2921
2922      pc++;
2923   }
2924
2925   return progress;
2926}
2927
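/**
 * Removes GRF-to-GRF moves by rewriting later reads of the move's
 * destination to read its source instead, when neither register is
 * written again first.
 */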
2928bool
2929fs_visitor::register_coalesce()
2930{
2931   bool progress = false;
2932
2933   foreach_iter(exec_list_iterator, iter, this->instructions) {
2934      fs_inst *inst = (fs_inst *)iter.get();
2935
2936      if (inst->opcode != BRW_OPCODE_MOV ||
2937	  inst->predicated ||
2938	  inst->saturate ||
2939	  inst->dst.file != GRF || inst->src[0].file != GRF ||
2940	  inst->dst.type != inst->src[0].type)
2941	 continue;
2942
2943      /* Found a move of a GRF to a GRF.  Let's see if we can coalesce
2944       * them: check for no writes to either one until the exit of the
2945       * program.
2946       */
2947      bool interfered = false;
2948      exec_list_iterator scan_iter = iter;
2949      scan_iter.next();
2950      for (; scan_iter.has_next(); scan_iter.next()) {
2951	 fs_inst *scan_inst = (fs_inst *)scan_iter.get();
2952
2953	 if (scan_inst->opcode == BRW_OPCODE_DO ||
2954	     scan_inst->opcode == BRW_OPCODE_WHILE ||
2955	     scan_inst->opcode == BRW_OPCODE_ENDIF) {
2956	    interfered = true;
2957	    iter = scan_iter;
2958	    break;
2959	 }
2960
2961	 if (scan_inst->dst.file == GRF) {
2962	    if (scan_inst->dst.reg == inst->dst.reg &&
2963		(scan_inst->dst.reg_offset == inst->dst.reg_offset ||
2964		 scan_inst->opcode == FS_OPCODE_TEX)) {
2965	       interfered = true;
2966	       break;
2967	    }
2968	    if (scan_inst->dst.reg == inst->src[0].reg &&
2969		(scan_inst->dst.reg_offset == inst->src[0].reg_offset ||
2970		 scan_inst->opcode == FS_OPCODE_TEX)) {
2971	       interfered = true;
2972	       break;
2973	    }
2974	 }
2975      }
2976      if (interfered) {
2977	 continue;
2978      }
2979
2980      /* Update live interval so we don't have to recalculate. */
2981      this->virtual_grf_use[inst->src[0].reg] = MAX2(virtual_grf_use[inst->src[0].reg],
2982						     virtual_grf_use[inst->dst.reg]);
2983
2984      /* Rewrite the later usage to point at the source of the move to
2985       * be removed.
2986       */
2987      for (exec_list_iterator scan_iter = iter; scan_iter.has_next();
2988	   scan_iter.next()) {
2989	 fs_inst *scan_inst = (fs_inst *)scan_iter.get();
2990
2991	 for (int i = 0; i < 3; i++) {
2992	    if (scan_inst->src[i].file == GRF &&
2993		scan_inst->src[i].reg == inst->dst.reg &&
2994		scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
2995	       scan_inst->src[i].reg = inst->src[0].reg;
2996	       scan_inst->src[i].reg_offset = inst->src[0].reg_offset;
2997	       scan_inst->src[i].abs |= inst->src[0].abs;
2998	       scan_inst->src[i].negate ^= inst->src[0].negate;
2999	       scan_inst->src[i].smear = inst->src[0].smear;
3000	    }
3001	 }
3002      }
3003
3004      inst->remove();
3005      progress = true;
3006   }
3007
3008   return progress;
3009}
3010
3011
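/**
 * Eliminates GRF-to-MRF moves by retargeting the instruction that
 * computed the GRF value so it writes the MRF directly, when the GRF
 * has no later readers and nothing in between interferes.
 */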
3012bool
3013fs_visitor::compute_to_mrf()
3014{
3015   bool progress = false;
3016   int next_ip = 0;
3017
3018   foreach_iter(exec_list_iterator, iter, this->instructions) {
3019      fs_inst *inst = (fs_inst *)iter.get();
3020
3021      int ip = next_ip;
3022      next_ip++;
3023
3024      if (inst->opcode != BRW_OPCODE_MOV ||
3025	  inst->predicated ||
3026	  inst->dst.file != MRF || inst->src[0].file != GRF ||
3027	  inst->dst.type != inst->src[0].type ||
3028	  inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
3029	 continue;
3030
3031      /* Can't compute-to-MRF this GRF if someone else was going to
3032       * read it later.
3033       */
3034      if (this->virtual_grf_use[inst->src[0].reg] > ip)
3035	 continue;
3036
3037      /* Found a move of a GRF to a MRF.  Let's see if we can go
3038       * rewrite the thing that made this GRF to write into the MRF.
3039       */
3040      fs_inst *scan_inst;
3041      for (scan_inst = (fs_inst *)inst->prev;
3042	   scan_inst->prev != NULL;
3043	   scan_inst = (fs_inst *)scan_inst->prev) {
3044	 if (scan_inst->dst.file == GRF &&
3045	     scan_inst->dst.reg == inst->src[0].reg) {
3046	    /* Found the last thing to write our reg we want to turn
3047	     * into a compute-to-MRF.
3048	     */
3049
3050	    if (scan_inst->opcode == FS_OPCODE_TEX) {
	       /* Texturing writes several contiguous regs, so we can't
		* compute-to-MRF that.
		*/
3054	       break;
3055	    }
3056
3057	    /* If it's predicated, it (probably) didn't populate all
3058	     * the channels.
3059	     */
3060	    if (scan_inst->predicated)
3061	       break;
3062
3063	    /* SEND instructions can't have MRF as a destination. */
3064	    if (scan_inst->mlen)
3065	       break;
3066
3067	    if (intel->gen >= 6) {
3068	       /* gen6 math instructions must have the destination be
3069		* GRF, so no compute-to-MRF for them.
3070		*/
3071	       if (scan_inst->opcode == FS_OPCODE_RCP ||
3072		   scan_inst->opcode == FS_OPCODE_RSQ ||
3073		   scan_inst->opcode == FS_OPCODE_SQRT ||
3074		   scan_inst->opcode == FS_OPCODE_EXP2 ||
3075		   scan_inst->opcode == FS_OPCODE_LOG2 ||
3076		   scan_inst->opcode == FS_OPCODE_SIN ||
3077		   scan_inst->opcode == FS_OPCODE_COS ||
3078		   scan_inst->opcode == FS_OPCODE_POW) {
3079		  break;
3080	       }
3081	    }
3082
3083	    if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
3084	       /* Found the creator of our MRF's source value. */
3085	       scan_inst->dst.file = MRF;
3086	       scan_inst->dst.hw_reg = inst->dst.hw_reg;
3087	       scan_inst->saturate |= inst->saturate;
3088	       inst->remove();
3089	       progress = true;
3090	    }
3091	    break;
3092	 }
3093
3094	 /* We don't handle flow control here.  Most computation of
	  * values that end up in MRFs happens shortly before the MRF
3096	  * write anyway.
3097	  */
3098	 if (scan_inst->opcode == BRW_OPCODE_DO ||
3099	     scan_inst->opcode == BRW_OPCODE_WHILE ||
3100	     scan_inst->opcode == BRW_OPCODE_ENDIF) {
3101	    break;
3102	 }
3103
3104	 /* You can't read from an MRF, so if someone else reads our
3105	  * MRF's source GRF that we wanted to rewrite, that stops us.
3106	  */
3107	 bool interfered = false;
3108	 for (int i = 0; i < 3; i++) {
3109	    if (scan_inst->src[i].file == GRF &&
3110		scan_inst->src[i].reg == inst->src[0].reg &&
3111		scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
3112	       interfered = true;
3113	    }
3114	 }
3115	 if (interfered)
3116	    break;
3117
3118	 if (scan_inst->dst.file == MRF &&
3119	     scan_inst->dst.hw_reg == inst->dst.hw_reg) {
	    /* Somebody else wrote our MRF here, so we can't
3121	     * compute-to-MRF before that.
3122	     */
3123	    break;
3124	 }
3125
3126	 if (scan_inst->mlen > 0) {
3127	    /* Found a SEND instruction, which means that there are
3128	     * live values in MRFs from base_mrf to base_mrf +
3129	     * scan_inst->mlen - 1.  Don't go pushing our MRF write up
3130	     * above it.
3131	     */
3132	    if (inst->dst.hw_reg >= scan_inst->base_mrf &&
3133		inst->dst.hw_reg < scan_inst->base_mrf + scan_inst->mlen) {
3134	       break;
3135	    }
3136	 }
3137      }
3138   }
3139
3140   return progress;
3141}
3142
3143/**
 * Walks through basic blocks, looking for repeated MRF writes and
 * removing the redundant later ones.
3146 */
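/* An illustrative sketch (register numbers hypothetical), within one
 * basic block:
 *
 *    MOV m3, vgrf7
 *    ... (nothing writes m3 or vgrf7) ...
 *    MOV m3, vgrf7
 *
 * The second MOV is redundant and is removed.
 */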
3147bool
3148fs_visitor::remove_duplicate_mrf_writes()
3149{
3150   fs_inst *last_mrf_move[16];
3151   bool progress = false;
3152
3153   memset(last_mrf_move, 0, sizeof(last_mrf_move));
3154
3155   foreach_iter(exec_list_iterator, iter, this->instructions) {
3156      fs_inst *inst = (fs_inst *)iter.get();
3157
3158      switch (inst->opcode) {
3159      case BRW_OPCODE_DO:
3160      case BRW_OPCODE_WHILE:
3161      case BRW_OPCODE_IF:
3162      case BRW_OPCODE_ELSE:
3163      case BRW_OPCODE_ENDIF:
3164	 memset(last_mrf_move, 0, sizeof(last_mrf_move));
3165	 continue;
3166      default:
3167	 break;
3168      }
3169
3170      if (inst->opcode == BRW_OPCODE_MOV &&
3171	  inst->dst.file == MRF) {
3172	 fs_inst *prev_inst = last_mrf_move[inst->dst.hw_reg];
3173	 if (prev_inst && inst->equals(prev_inst)) {
3174	    inst->remove();
3175	    progress = true;
3176	    continue;
3177	 }
3178      }
3179
3180      /* Clear out the last-write records for MRFs that were overwritten. */
3181      if (inst->dst.file == MRF) {
3182	 last_mrf_move[inst->dst.hw_reg] = NULL;
3183      }
3184
3185      if (inst->mlen > 0) {
	 /* Found a SEND instruction, which will include two or fewer
3187	  * implied MRF writes.  We could do better here.
3188	  */
3189	 for (int i = 0; i < implied_mrf_writes(inst); i++) {
3190	    last_mrf_move[inst->base_mrf + i] = NULL;
3191	 }
3192      }
3193
3194      /* Clear out any MRF move records whose sources got overwritten. */
3195      if (inst->dst.file == GRF) {
3196	 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
3197	    if (last_mrf_move[i] &&
3198		last_mrf_move[i]->src[0].reg == inst->dst.reg) {
3199	       last_mrf_move[i] = NULL;
3200	    }
3201	 }
3202      }
3203
3204      if (inst->opcode == BRW_OPCODE_MOV &&
3205	  inst->dst.file == MRF &&
3206	  inst->src[0].file == GRF &&
3207	  !inst->predicated) {
3208	 last_mrf_move[inst->dst.hw_reg] = inst;
3209      }
3210   }
3211
3212   return progress;
3213}
3214
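/**
 * Returns whether the live intervals of two virtual GRFs overlap.
 *
 * For example, the intervals [2, 10] and [10, 14] do not interfere:
 * the use at IP 10 reads its value before the def at IP 10 writes the
 * new one, so the two can share a hardware register.
 */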
3215bool
3216fs_visitor::virtual_grf_interferes(int a, int b)
3217{
3218   int start = MAX2(this->virtual_grf_def[a], this->virtual_grf_def[b]);
3219   int end = MIN2(this->virtual_grf_use[a], this->virtual_grf_use[b]);
3220
3221   /* For dead code, just check if the def interferes with the other range. */
3222   if (this->virtual_grf_use[a] == -1) {
3223      return (this->virtual_grf_def[a] >= this->virtual_grf_def[b] &&
3224	      this->virtual_grf_def[a] < this->virtual_grf_use[b]);
3225   }
3226   if (this->virtual_grf_use[b] == -1) {
3227      return (this->virtual_grf_def[b] >= this->virtual_grf_def[a] &&
3228	      this->virtual_grf_def[b] < this->virtual_grf_use[a]);
3229   }
3230
3231   return start < end;
3232}
3233
3234static struct brw_reg brw_reg_from_fs_reg(fs_reg *reg)
3235{
3236   struct brw_reg brw_reg;
3237
3238   switch (reg->file) {
3239   case GRF:
3240   case ARF:
3241   case MRF:
3242      if (reg->smear == -1) {
3243	 brw_reg = brw_vec8_reg(reg->file,
3244				reg->hw_reg, 0);
3245      } else {
3246	 brw_reg = brw_vec1_reg(reg->file,
3247				reg->hw_reg, reg->smear);
3248      }
3249      brw_reg = retype(brw_reg, reg->type);
3250      break;
3251   case IMM:
3252      switch (reg->type) {
3253      case BRW_REGISTER_TYPE_F:
3254	 brw_reg = brw_imm_f(reg->imm.f);
3255	 break;
3256      case BRW_REGISTER_TYPE_D:
3257	 brw_reg = brw_imm_d(reg->imm.i);
3258	 break;
3259      case BRW_REGISTER_TYPE_UD:
3260	 brw_reg = brw_imm_ud(reg->imm.u);
3261	 break;
3262      default:
3263	 assert(!"not reached");
3264	 brw_reg = brw_null_reg();
3265	 break;
3266      }
3267      break;
3268   case FIXED_HW_REG:
3269      brw_reg = reg->fixed_hw_reg;
3270      break;
3271   case BAD_FILE:
3272      /* Probably unused. */
3273      brw_reg = brw_null_reg();
3274      break;
3275   case UNIFORM:
3276      assert(!"not reached");
3277      brw_reg = brw_null_reg();
3278      break;
3279   default:
3280      assert(!"not reached");
3281      brw_reg = brw_null_reg();
3282      break;
3283   }
3284   if (reg->abs)
3285      brw_reg = brw_abs(brw_reg);
3286   if (reg->negate)
3287      brw_reg = negate(brw_reg);
3288
3289   return brw_reg;
3290}
3291
3292void
3293fs_visitor::generate_code()
3294{
3295   int last_native_inst = 0;
3296   struct brw_instruction *if_stack[16], *loop_stack[16];
3297   int if_stack_depth = 0, loop_stack_depth = 0;
3298   int if_depth_in_loop[16];
3299   const char *last_annotation_string = NULL;
3300   ir_instruction *last_annotation_ir = NULL;
3301
3302   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3303      printf("Native code for fragment shader %d:\n",
3304	     ctx->Shader.CurrentFragmentProgram->Name);
3305   }
3306
3307   if_depth_in_loop[loop_stack_depth] = 0;
3308
3309   memset(&if_stack, 0, sizeof(if_stack));
3310   foreach_iter(exec_list_iterator, iter, this->instructions) {
3311      fs_inst *inst = (fs_inst *)iter.get();
3312      struct brw_reg src[3], dst;
3313
3314      if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3315	 if (last_annotation_ir != inst->ir) {
3316	    last_annotation_ir = inst->ir;
3317	    if (last_annotation_ir) {
3318	       printf("   ");
3319	       last_annotation_ir->print();
3320	       printf("\n");
3321	    }
3322	 }
3323	 if (last_annotation_string != inst->annotation) {
3324	    last_annotation_string = inst->annotation;
3325	    if (last_annotation_string)
3326	       printf("   %s\n", last_annotation_string);
3327	 }
3328      }
3329
3330      for (unsigned int i = 0; i < 3; i++) {
3331	 src[i] = brw_reg_from_fs_reg(&inst->src[i]);
3332      }
3333      dst = brw_reg_from_fs_reg(&inst->dst);
3334
3335      brw_set_conditionalmod(p, inst->conditional_mod);
3336      brw_set_predicate_control(p, inst->predicated);
3337      brw_set_saturate(p, inst->saturate);
3338
3339      switch (inst->opcode) {
3340      case BRW_OPCODE_MOV:
3341	 brw_MOV(p, dst, src[0]);
3342	 break;
3343      case BRW_OPCODE_ADD:
3344	 brw_ADD(p, dst, src[0], src[1]);
3345	 break;
3346      case BRW_OPCODE_MUL:
3347	 brw_MUL(p, dst, src[0], src[1]);
3348	 break;
3349
3350      case BRW_OPCODE_FRC:
3351	 brw_FRC(p, dst, src[0]);
3352	 break;
3353      case BRW_OPCODE_RNDD:
3354	 brw_RNDD(p, dst, src[0]);
3355	 break;
3356      case BRW_OPCODE_RNDE:
3357	 brw_RNDE(p, dst, src[0]);
3358	 break;
3359      case BRW_OPCODE_RNDZ:
3360	 brw_RNDZ(p, dst, src[0]);
3361	 break;
3362
3363      case BRW_OPCODE_AND:
3364	 brw_AND(p, dst, src[0], src[1]);
3365	 break;
3366      case BRW_OPCODE_OR:
3367	 brw_OR(p, dst, src[0], src[1]);
3368	 break;
3369      case BRW_OPCODE_XOR:
3370	 brw_XOR(p, dst, src[0], src[1]);
3371	 break;
3372      case BRW_OPCODE_NOT:
3373	 brw_NOT(p, dst, src[0]);
3374	 break;
3375      case BRW_OPCODE_ASR:
3376	 brw_ASR(p, dst, src[0], src[1]);
3377	 break;
3378      case BRW_OPCODE_SHR:
3379	 brw_SHR(p, dst, src[0], src[1]);
3380	 break;
3381      case BRW_OPCODE_SHL:
3382	 brw_SHL(p, dst, src[0], src[1]);
3383	 break;
3384
3385      case BRW_OPCODE_CMP:
3386	 brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
3387	 break;
3388      case BRW_OPCODE_SEL:
3389	 brw_SEL(p, dst, src[0], src[1]);
3390	 break;
3391
3392      case BRW_OPCODE_IF:
3393	 assert(if_stack_depth < 16);
3394	 if (inst->src[0].file != BAD_FILE) {
3395	    assert(intel->gen >= 6);
3396	    if_stack[if_stack_depth] = brw_IF_gen6(p, inst->conditional_mod, src[0], src[1]);
3397	 } else {
3398	    if_stack[if_stack_depth] = brw_IF(p, BRW_EXECUTE_8);
3399	 }
3400	 if_depth_in_loop[loop_stack_depth]++;
3401	 if_stack_depth++;
3402	 break;
3403
3404      case BRW_OPCODE_ELSE:
3405	 if_stack[if_stack_depth - 1] =
3406	    brw_ELSE(p, if_stack[if_stack_depth - 1]);
3407	 break;
3408      case BRW_OPCODE_ENDIF:
3409	 if_stack_depth--;
	 brw_ENDIF(p, if_stack[if_stack_depth]);
3411	 if_depth_in_loop[loop_stack_depth]--;
3412	 break;
3413
3414      case BRW_OPCODE_DO:
3415	 loop_stack[loop_stack_depth++] = brw_DO(p, BRW_EXECUTE_8);
3416	 if_depth_in_loop[loop_stack_depth] = 0;
3417	 break;
3418
3419      case BRW_OPCODE_BREAK:
3420	 brw_BREAK(p, if_depth_in_loop[loop_stack_depth]);
3421	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
3422	 break;
3423      case BRW_OPCODE_CONTINUE:
3424	 /* FINISHME: We need to write the loop instruction support still. */
3425	 if (intel->gen >= 6)
3426	    brw_CONT_gen6(p, loop_stack[loop_stack_depth - 1]);
3427	 else
3428	    brw_CONT(p, if_depth_in_loop[loop_stack_depth]);
3429	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
3430	 break;
3431
3432      case BRW_OPCODE_WHILE: {
3433	 struct brw_instruction *inst0, *inst1;
3434	 GLuint br = 1;
3435
3436	 if (intel->gen >= 5)
3437	    br = 2;
3438
3439	 assert(loop_stack_depth > 0);
3440	 loop_stack_depth--;
3441	 inst0 = inst1 = brw_WHILE(p, loop_stack[loop_stack_depth]);
3442	 if (intel->gen < 6) {
3443	    /* patch all the BREAK/CONT instructions from last BGNLOOP */
3444	    while (inst0 > loop_stack[loop_stack_depth]) {
3445	       inst0--;
3446	       if (inst0->header.opcode == BRW_OPCODE_BREAK &&
3447		   inst0->bits3.if_else.jump_count == 0) {
		  inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
	       } else if (inst0->header.opcode == BRW_OPCODE_CONTINUE &&
			  inst0->bits3.if_else.jump_count == 0) {
3452		  inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
3453	       }
3454	    }
3455	 }
3456      }
3457	 break;
3458
3459      case FS_OPCODE_RCP:
3460      case FS_OPCODE_RSQ:
3461      case FS_OPCODE_SQRT:
3462      case FS_OPCODE_EXP2:
3463      case FS_OPCODE_LOG2:
3464      case FS_OPCODE_POW:
3465      case FS_OPCODE_SIN:
3466      case FS_OPCODE_COS:
3467	 generate_math(inst, dst, src);
3468	 break;
3469      case FS_OPCODE_LINTERP:
3470	 generate_linterp(inst, dst, src);
3471	 break;
3472      case FS_OPCODE_TEX:
3473      case FS_OPCODE_TXB:
3474      case FS_OPCODE_TXL:
3475	 generate_tex(inst, dst);
3476	 break;
3477      case FS_OPCODE_DISCARD_NOT:
3478	 generate_discard_not(inst, dst);
3479	 break;
3480      case FS_OPCODE_DISCARD_AND:
3481	 generate_discard_and(inst, src[0]);
3482	 break;
3483      case FS_OPCODE_DDX:
3484	 generate_ddx(inst, dst, src[0]);
3485	 break;
3486      case FS_OPCODE_DDY:
3487	 generate_ddy(inst, dst, src[0]);
3488	 break;
3489
3490      case FS_OPCODE_SPILL:
3491	 generate_spill(inst, src[0]);
3492	 break;
3493
3494      case FS_OPCODE_UNSPILL:
3495	 generate_unspill(inst, dst);
3496	 break;
3497
3498      case FS_OPCODE_PULL_CONSTANT_LOAD:
3499	 generate_pull_constant_load(inst, dst);
3500	 break;
3501
3502      case FS_OPCODE_FB_WRITE:
3503	 generate_fb_write(inst);
3504	 break;
3505      default:
3506	 if (inst->opcode < (int)ARRAY_SIZE(brw_opcodes)) {
3507	    _mesa_problem(ctx, "Unsupported opcode `%s' in FS",
3508			  brw_opcodes[inst->opcode].name);
3509	 } else {
3510	    _mesa_problem(ctx, "Unsupported opcode %d in FS", inst->opcode);
3511	 }
3512	 this->fail = true;
3513      }
3514
3515      if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3516	 for (unsigned int i = last_native_inst; i < p->nr_insn; i++) {
3517	    if (0) {
3518	       printf("0x%08x 0x%08x 0x%08x 0x%08x ",
3519		      ((uint32_t *)&p->store[i])[3],
3520		      ((uint32_t *)&p->store[i])[2],
3521		      ((uint32_t *)&p->store[i])[1],
3522		      ((uint32_t *)&p->store[i])[0]);
3523	    }
3524	    brw_disasm(stdout, &p->store[i], intel->gen);
3525	 }
3526      }
3527
3528      last_native_inst = p->nr_insn;
3529   }
3530
3531   brw_set_uip_jip(p);
3532
3533   /* OK, while the INTEL_DEBUG=wm above is very nice for debugging FS
3534    * emit issues, it doesn't get the jump distances into the output,
3535    * which is often something we want to debug.  So this is here in
3536    * case you're doing that.
3537    */
3538   if (0) {
3539      if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3540	 for (unsigned int i = 0; i < p->nr_insn; i++) {
3541	    printf("0x%08x 0x%08x 0x%08x 0x%08x ",
3542		   ((uint32_t *)&p->store[i])[3],
3543		   ((uint32_t *)&p->store[i])[2],
3544		   ((uint32_t *)&p->store[i])[1],
3545		   ((uint32_t *)&p->store[i])[0]);
3546	    brw_disasm(stdout, &p->store[i], intel->gen);
3547	 }
3548      }
3549   }
3550}
3551
3552GLboolean
3553brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c)
3554{
3555   struct intel_context *intel = &brw->intel;
3556   struct gl_context *ctx = &intel->ctx;
3557   struct gl_shader_program *prog = ctx->Shader.CurrentFragmentProgram;
3558
3559   if (!prog)
3560      return GL_FALSE;
3561
3562   struct brw_shader *shader =
3563     (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3564   if (!shader)
3565      return GL_FALSE;
3566
3567   /* We always use 8-wide mode, at least for now.  For one, flow
3568    * control only works in 8-wide.  Also, when we're fragment shader
3569    * bound, we're almost always under register pressure as well, so
3570    * 8-wide would save us from the performance cliff of spilling
3571    * regs.
3572    */
3573   c->dispatch_width = 8;
3574
3575   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3576      printf("GLSL IR for native fragment shader %d:\n", prog->Name);
3577      _mesa_print_ir(shader->ir, NULL);
3578      printf("\n");
3579   }
3580
3581   /* Now the main event: Visit the shader IR and generate our FS IR for it.
3582    */
3583   fs_visitor v(c, shader);
3584
3585   if (0) {
3586      v.emit_dummy_fs();
3587   } else {
3588      v.calculate_urb_setup();
3589      if (intel->gen < 6)
3590	 v.emit_interpolation_setup_gen4();
3591      else
3592	 v.emit_interpolation_setup_gen6();
3593
3594      /* Generate FS IR for main().  (the visitor only descends into
3595       * functions called "main").
3596       */
3597      foreach_iter(exec_list_iterator, iter, *shader->ir) {
3598	 ir_instruction *ir = (ir_instruction *)iter.get();
3599	 v.base_ir = ir;
3600	 ir->accept(&v);
3601      }
3602
3603      v.emit_fb_writes();
3604
3605      v.split_virtual_grfs();
3606      v.setup_pull_constants();
3607
3608      v.assign_curb_setup();
3609      v.assign_urb_setup();
3610
3611      bool progress;
3612      do {
3613	 progress = false;
3614
3615	 progress = v.remove_duplicate_mrf_writes() || progress;
3616
3617	 v.calculate_live_intervals();
3618	 progress = v.propagate_constants() || progress;
3619	 progress = v.register_coalesce() || progress;
3620	 progress = v.compute_to_mrf() || progress;
3621	 progress = v.dead_code_eliminate() || progress;
3622      } while (progress);
3623
3624      if (0) {
3625	 /* Debug of register spilling: Go spill everything. */
3626	 int virtual_grf_count = v.virtual_grf_next;
3627	 for (int i = 1; i < virtual_grf_count; i++) {
3628	    v.spill_reg(i);
3629	 }
3630	 v.calculate_live_intervals();
3631      }
3632
3633      if (0)
3634	 v.assign_regs_trivial();
3635      else {
3636	 while (!v.assign_regs()) {
3637	    if (v.fail)
3638	       break;
3639
3640	    v.calculate_live_intervals();
3641	 }
3642      }
3643   }
3644
3645   if (!v.fail)
3646      v.generate_code();
3647
3648   assert(!v.fail); /* FINISHME: Cleanly fail, tested at link time, etc. */
3649
3650   if (v.fail)
3651      return GL_FALSE;
3652
3653   c->prog_data.total_grf = v.grf_used;
3654
3655   return GL_TRUE;
3656}
3657