brw_fs.cpp revision 7c7df146b59bae9dcb3a271bd3c671e273015617
/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Eric Anholt <eric@anholt.net>
 *
 */

extern "C" {

#include <sys/types.h>

#include "main/macros.h"
#include "main/shaderobj.h"
#include "main/uniforms.h"
#include "program/prog_parameter.h"
#include "program/prog_print.h"
#include "program/prog_optimize.h"
#include "program/register_allocate.h"
#include "program/sampler.h"
#include "program/hash_table.h"
#include "brw_context.h"
#include "brw_eu.h"
#include "brw_wm.h"
#include "talloc.h"
}
#include "brw_fs.h"
#include "../glsl/glsl_types.h"
#include "../glsl/ir_optimization.h"
#include "../glsl/ir_print_visitor.h"

static struct brw_reg brw_reg_from_fs_reg(class fs_reg *reg);

struct gl_shader *
brw_new_shader(struct gl_context *ctx, GLuint name, GLuint type)
{
   struct brw_shader *shader;

   shader = talloc_zero(NULL, struct brw_shader);
   if (shader) {
      shader->base.Type = type;
      shader->base.Name = name;
      _mesa_init_shader(ctx, &shader->base);
   }

   return &shader->base;
}

struct gl_shader_program *
brw_new_shader_program(struct gl_context *ctx, GLuint name)
{
   struct brw_shader_program *prog;
   prog = talloc_zero(NULL, struct brw_shader_program);
   if (prog) {
      prog->base.Name = name;
      _mesa_init_shader_program(ctx, &prog->base);
   }
   return &prog->base;
}

GLboolean
brw_compile_shader(struct gl_context *ctx, struct gl_shader *shader)
{
   if (!_mesa_ir_compile_shader(ctx, shader))
      return GL_FALSE;

   return GL_TRUE;
}

GLboolean
brw_link_shader(struct gl_context *ctx, struct gl_shader_program *prog)
{
   struct brw_context *brw = brw_context(ctx);
   struct intel_context *intel = &brw->intel;

   struct brw_shader *shader =
      (struct brw_shader *)prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
   if (shader != NULL) {
      void *mem_ctx = talloc_new(NULL);
      bool progress;

      if (shader->ir)
	 talloc_free(shader->ir);
      shader->ir = new(shader) exec_list;
      clone_ir_list(mem_ctx, shader->ir, shader->base.ir);

      do_mat_op_to_vec(shader->ir);
      lower_instructions(shader->ir,
			 MOD_TO_FRACT |
			 DIV_TO_MUL_RCP |
			 SUB_TO_ADD_NEG |
			 EXP_TO_EXP2 |
			 LOG_TO_LOG2);

      /* Pre-gen6 HW can only nest if-statements 16 deep.  Beyond this,
       * if-statements need to be flattened.
       */
      if (intel->gen < 6)
	 lower_if_to_cond_assign(shader->ir, 16);

      do_lower_texture_projection(shader->ir);
      do_vec_index_to_cond_assign(shader->ir);
      brw_do_cubemap_normalize(shader->ir);

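      /* Lower and optimize repeatedly until no pass makes further progress. */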
      do {
	 progress = false;

	 brw_do_channel_expressions(shader->ir);
	 brw_do_vector_splitting(shader->ir);

	 progress = do_lower_jumps(shader->ir, true, true,
				   true, /* main return */
				   false, /* continue */
				   false /* loops */
				   ) || progress;

	 progress = do_common_optimization(shader->ir, true, 32) || progress;

	 progress = lower_noise(shader->ir) || progress;
	 progress =
	    lower_variable_index_to_cond_assign(shader->ir,
						GL_TRUE, /* input */
						GL_TRUE, /* output */
						GL_TRUE, /* temp */
						GL_TRUE /* uniform */
						) || progress;
	 progress = lower_quadop_vector(shader->ir, false) || progress;
      } while (progress);

      validate_ir_tree(shader->ir);

      reparent_ir(shader->ir, shader->ir);
      talloc_free(mem_ctx);
   }

   if (!_mesa_ir_link_shader(ctx, prog))
      return GL_FALSE;

   return GL_TRUE;
}

static int
type_size(const struct glsl_type *type)
{
   unsigned int size, i;

   switch (type->base_type) {
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_BOOL:
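      /* components() covers scalars, vectors, and matrices alike. */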
      return type->components();
   case GLSL_TYPE_ARRAY:
      return type_size(type->fields.array) * type->length;
   case GLSL_TYPE_STRUCT:
      size = 0;
      for (i = 0; i < type->length; i++) {
	 size += type_size(type->fields.structure[i].type);
      }
      return size;
   case GLSL_TYPE_SAMPLER:
      /* Samplers take up no register space, since they're baked in at
       * link time.
       */
      return 0;
   default:
      assert(!"not reached");
      return 0;
   }
}

/**
 * Returns how many MRFs an FS opcode will write over.
 *
 * Note that this is not the 0 or 1 implied writes in an actual gen
 * instruction -- the FS opcodes often generate MOVs in addition.
 */
int
fs_visitor::implied_mrf_writes(fs_inst *inst)
{
   if (inst->mlen == 0)
      return 0;

   switch (inst->opcode) {
   case FS_OPCODE_RCP:
   case FS_OPCODE_RSQ:
   case FS_OPCODE_SQRT:
   case FS_OPCODE_EXP2:
   case FS_OPCODE_LOG2:
   case FS_OPCODE_SIN:
   case FS_OPCODE_COS:
      return 1;
   case FS_OPCODE_POW:
      return 2;
   case FS_OPCODE_TEX:
   case FS_OPCODE_TXB:
   case FS_OPCODE_TXL:
      return 1;
   case FS_OPCODE_FB_WRITE:
      return 2;
   case FS_OPCODE_PULL_CONSTANT_LOAD:
   case FS_OPCODE_UNSPILL:
      return 1;
   case FS_OPCODE_SPILL:
      return 2;
   default:
      assert(!"not reached");
      return inst->mlen;
   }
}

int
fs_visitor::virtual_grf_alloc(int size)
{
   if (virtual_grf_array_size <= virtual_grf_next) {
      if (virtual_grf_array_size == 0)
	 virtual_grf_array_size = 16;
      else
	 virtual_grf_array_size *= 2;
      virtual_grf_sizes = talloc_realloc(mem_ctx, virtual_grf_sizes,
					 int, virtual_grf_array_size);

      /* This slot is always unused. */
      virtual_grf_sizes[0] = 0;
   }
   virtual_grf_sizes[virtual_grf_next] = size;
   return virtual_grf_next++;
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int hw_reg)
{
   init();
   this->file = file;
   this->hw_reg = hw_reg;
   this->type = BRW_REGISTER_TYPE_F;
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int hw_reg, uint32_t type)
{
   init();
   this->file = file;
   this->hw_reg = hw_reg;
   this->type = type;
}

int
brw_type_for_base_type(const struct glsl_type *type)
{
   switch (type->base_type) {
   case GLSL_TYPE_FLOAT:
      return BRW_REGISTER_TYPE_F;
   case GLSL_TYPE_INT:
   case GLSL_TYPE_BOOL:
      return BRW_REGISTER_TYPE_D;
   case GLSL_TYPE_UINT:
      return BRW_REGISTER_TYPE_UD;
   case GLSL_TYPE_ARRAY:
   case GLSL_TYPE_STRUCT:
   case GLSL_TYPE_SAMPLER:
      /* These should be overridden with the type of the member when
       * dereferenced into.  BRW_REGISTER_TYPE_UD seems like a likely
       * way to trip up if we don't.
       */
      return BRW_REGISTER_TYPE_UD;
   default:
      assert(!"not reached");
      return BRW_REGISTER_TYPE_F;
   }
}

/** Automatic reg constructor. */
fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
{
   init();

   this->file = GRF;
   this->reg = v->virtual_grf_alloc(type_size(type));
   this->reg_offset = 0;
   this->type = brw_type_for_base_type(type);
}

fs_reg *
fs_visitor::variable_storage(ir_variable *var)
{
   return (fs_reg *)hash_table_find(this->variable_ht, var);
}

/* Our support for uniforms is piggy-backed on the struct
 * gl_fragment_program, because that's where the values actually
 * get stored, rather than in some global gl_shader_program uniform
 * store.
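 *
 * The values all live as floats in the parameter list;
 * c->prog_data.param_convert[] records any conversion (e.g. float-to-int)
 * to apply when they are uploaded.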
 */
int
fs_visitor::setup_uniform_values(int loc, const glsl_type *type)
{
   unsigned int offset = 0;
   float *vec_values;

   if (type->is_matrix()) {
      const glsl_type *column = glsl_type::get_instance(GLSL_TYPE_FLOAT,
							type->vector_elements,
							1);

      for (unsigned int i = 0; i < type->matrix_columns; i++) {
	 offset += setup_uniform_values(loc + offset, column);
      }

      return offset;
   }

   switch (type->base_type) {
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_BOOL:
      vec_values = fp->Base.Parameters->ParameterValues[loc];
      for (unsigned int i = 0; i < type->vector_elements; i++) {
	 unsigned int param = c->prog_data.nr_params++;

	 assert(param < ARRAY_SIZE(c->prog_data.param));

	 switch (type->base_type) {
	 case GLSL_TYPE_FLOAT:
	    c->prog_data.param_convert[param] = PARAM_NO_CONVERT;
	    break;
	 case GLSL_TYPE_UINT:
	    c->prog_data.param_convert[param] = PARAM_CONVERT_F2U;
	    break;
	 case GLSL_TYPE_INT:
	    c->prog_data.param_convert[param] = PARAM_CONVERT_F2I;
	    break;
	 case GLSL_TYPE_BOOL:
	    c->prog_data.param_convert[param] = PARAM_CONVERT_F2B;
	    break;
	 default:
	    assert(!"not reached");
	    c->prog_data.param_convert[param] = PARAM_NO_CONVERT;
	    break;
	 }

	 c->prog_data.param[param] = &vec_values[i];
      }
      return 1;

   case GLSL_TYPE_STRUCT:
      for (unsigned int i = 0; i < type->length; i++) {
	 offset += setup_uniform_values(loc + offset,
					type->fields.structure[i].type);
      }
      return offset;

   case GLSL_TYPE_ARRAY:
      for (unsigned int i = 0; i < type->length; i++) {
	 offset += setup_uniform_values(loc + offset, type->fields.array);
      }
      return offset;

   case GLSL_TYPE_SAMPLER:
      /* The sampler takes up a slot, but we don't use any values from it. */
      return 1;

   default:
      assert(!"not reached");
      return 0;
   }
}


/* Our support for builtin uniforms is even scarier than non-builtin.
 * It sits on top of the PROG_STATE_VAR parameters that are
 * automatically updated from GL context state.
 */
void
fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
{
   const struct gl_builtin_uniform_desc *statevar = NULL;

   for (unsigned int i = 0; _mesa_builtin_uniform_desc[i].name; i++) {
      statevar = &_mesa_builtin_uniform_desc[i];
      if (strcmp(ir->name, _mesa_builtin_uniform_desc[i].name) == 0)
	 break;
   }

   if (!statevar->name) {
      this->fail = true;
      printf("Failed to find builtin uniform `%s'\n", ir->name);
      return;
   }

   int array_count;
   if (ir->type->is_array()) {
      array_count = ir->type->length;
   } else {
      array_count = 1;
   }

   for (int a = 0; a < array_count; a++) {
      for (unsigned int i = 0; i < statevar->num_elements; i++) {
	 struct gl_builtin_uniform_element *element = &statevar->elements[i];
	 int tokens[STATE_LENGTH];

	 memcpy(tokens, element->tokens, sizeof(element->tokens));
	 if (ir->type->is_array()) {
	    tokens[1] = a;
	 }

	 /* This state reference has already been set up by ir_to_mesa,
	  * but we'll get the same index back here.
	  */
	 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
					       (gl_state_index *)tokens);
	 float *vec_values = this->fp->Base.Parameters->ParameterValues[index];

	 /* Add each of the unique swizzles of the element as a
	  * parameter.  This'll end up matching the expected layout of
	  * the array/matrix/structure we're trying to fill in.
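	  *
	  * A component equal to the previous one marks the end of the
	  * unique swizzles (e.g. .xyzz contributes three parameters).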
	  */
	 int last_swiz = -1;
	 for (unsigned int i = 0; i < 4; i++) {
	    int swiz = GET_SWZ(element->swizzle, i);
	    if (swiz == last_swiz)
	       break;
	    last_swiz = swiz;

	    c->prog_data.param_convert[c->prog_data.nr_params] =
	       PARAM_NO_CONVERT;
	    c->prog_data.param[c->prog_data.nr_params++] = &vec_values[swiz];
	 }
      }
   }
}

fs_reg *
fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   fs_reg wpos = *reg;
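   /* Flip Y when the shader's declared origin disagrees with the render
    * target's native orientation (window system vs. FBO).
    */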
   bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;

   /* gl_FragCoord.x */
   if (ir->pixel_center_integer) {
      emit(fs_inst(BRW_OPCODE_MOV, wpos, this->pixel_x));
   } else {
      emit(fs_inst(BRW_OPCODE_ADD, wpos, this->pixel_x, fs_reg(0.5f)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.y */
   if (!flip && ir->pixel_center_integer) {
      emit(fs_inst(BRW_OPCODE_MOV, wpos, this->pixel_y));
   } else {
      fs_reg pixel_y = this->pixel_y;
      float offset = (ir->pixel_center_integer ? 0.0 : 0.5);

      if (flip) {
	 pixel_y.negate = true;
	 offset += c->key.drawable_height - 1.0;
      }

      emit(fs_inst(BRW_OPCODE_ADD, wpos, pixel_y, fs_reg(offset)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.z */
   if (intel->gen >= 6) {
      emit(fs_inst(BRW_OPCODE_MOV, wpos,
		   fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
   } else {
      emit(fs_inst(FS_OPCODE_LINTERP, wpos, this->delta_x, this->delta_y,
		   interp_reg(FRAG_ATTRIB_WPOS, 2)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.w: Already set up in emit_interpolation_setup_*(). */
   emit(fs_inst(BRW_OPCODE_MOV, wpos, this->wpos_w));

   return reg;
}

fs_reg *
fs_visitor::emit_general_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   /* Interpolation is always in floating point regs. */
   reg->type = BRW_REGISTER_TYPE_F;
   fs_reg attr = *reg;

   unsigned int array_elements;
   const glsl_type *type;

   if (ir->type->is_array()) {
      array_elements = ir->type->length;
      if (array_elements == 0) {
	 this->fail = true;
      }
      type = ir->type->fields.array;
   } else {
      array_elements = 1;
      type = ir->type;
   }

   int location = ir->location;
   for (unsigned int i = 0; i < array_elements; i++) {
      for (unsigned int j = 0; j < type->matrix_columns; j++) {
	 if (urb_setup[location] == -1) {
	    /* If there's no incoming setup data for this slot, don't
	     * emit interpolation for it.
	     */
	    attr.reg_offset += type->vector_elements;
	    location++;
	    continue;
	 }

	 if (c->key.flat_shade && (location == FRAG_ATTRIB_COL0 ||
				   location == FRAG_ATTRIB_COL1)) {
	    /* Constant interpolation (flat shading) case. The SF has
	     * handed us defined values in only the constant offset
	     * field of the setup reg.
	     */
	    for (unsigned int c = 0; c < type->vector_elements; c++) {
	       struct brw_reg interp = interp_reg(location, c);
	       interp = suboffset(interp, 3);
	       emit(fs_inst(FS_OPCODE_CINTERP, attr, fs_reg(interp)));
	       attr.reg_offset++;
	    }
	 } else {
	    /* Perspective interpolation case. */
	    for (unsigned int c = 0; c < type->vector_elements; c++) {
	       struct brw_reg interp = interp_reg(location, c);
	       emit(fs_inst(FS_OPCODE_LINTERP,
			    attr,
			    this->delta_x,
			    this->delta_y,
			    fs_reg(interp)));
	       attr.reg_offset++;
	    }

	    if (intel->gen < 6) {
	       attr.reg_offset -= type->vector_elements;
	       for (unsigned int c = 0; c < type->vector_elements; c++) {
		  emit(fs_inst(BRW_OPCODE_MUL,
			       attr,
			       attr,
			       this->pixel_w));
		  attr.reg_offset++;
	       }
	    }
	 }
	 location++;
      }
   }

   return reg;
}

fs_reg *
fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);

   /* The frontfacing comes in as a bit in the thread payload. */
   if (intel->gen >= 6) {
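      /* The back-facing bit is bit 15 of g0.0; shift it down to bit 0,
       * invert it, and mask to produce a 0/1 front-facing value.
       */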
      emit(fs_inst(BRW_OPCODE_ASR,
		   *reg,
		   fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
		   fs_reg(15)));
      emit(fs_inst(BRW_OPCODE_NOT,
		   *reg,
		   *reg));
      emit(fs_inst(BRW_OPCODE_AND,
		   *reg,
		   *reg,
		   fs_reg(1)));
   } else {
      struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
      /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
       * us front face
       */
      fs_inst *inst = emit(fs_inst(BRW_OPCODE_CMP,
				   *reg,
				   fs_reg(r1_6ud),
				   fs_reg(1u << 31)));
      inst->conditional_mod = BRW_CONDITIONAL_L;
      emit(fs_inst(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u)));
   }

   return reg;
}

fs_inst *
fs_visitor::emit_math(fs_opcodes opcode, fs_reg dst, fs_reg src)
{
   switch (opcode) {
   case FS_OPCODE_RCP:
   case FS_OPCODE_RSQ:
   case FS_OPCODE_SQRT:
   case FS_OPCODE_EXP2:
   case FS_OPCODE_LOG2:
   case FS_OPCODE_SIN:
   case FS_OPCODE_COS:
      break;
   default:
      assert(!"not reached: bad math opcode");
      return NULL;
   }

   /* Can't do hstride == 0 args to gen6 math, so expand it out.  We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * The hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (intel->gen >= 6 && (src.file == UNIFORM ||
			   src.abs ||
			   src.negate)) {
      fs_reg expanded = fs_reg(this, glsl_type::float_type);
      emit(fs_inst(BRW_OPCODE_MOV, expanded, src));
      src = expanded;
   }

   fs_inst *inst = emit(fs_inst(opcode, dst, src));

   if (intel->gen < 6) {
      inst->base_mrf = 2;
      inst->mlen = 1;
   }

   return inst;
}

fs_inst *
fs_visitor::emit_math(fs_opcodes opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   int base_mrf = 2;
   fs_inst *inst;

   assert(opcode == FS_OPCODE_POW);

   if (intel->gen >= 6) {
      /* Can't do hstride == 0 args to gen6 math, so expand it out. */
      if (src0.file == UNIFORM) {
	 fs_reg expanded = fs_reg(this, glsl_type::float_type);
	 emit(fs_inst(BRW_OPCODE_MOV, expanded, src0));
	 src0 = expanded;
      }

      if (src1.file == UNIFORM) {
	 fs_reg expanded = fs_reg(this, glsl_type::float_type);
	 emit(fs_inst(BRW_OPCODE_MOV, expanded, src1));
	 src1 = expanded;
      }

      inst = emit(fs_inst(opcode, dst, src0, src1));
   } else {
      emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1), src1));
      inst = emit(fs_inst(opcode, dst, src0, reg_null_f));

      inst->base_mrf = base_mrf;
      inst->mlen = 2;
   }
   return inst;
}

void
fs_visitor::visit(ir_variable *ir)
{
   fs_reg *reg = NULL;

   if (variable_storage(ir))
      return;

   if (strcmp(ir->name, "gl_FragColor") == 0) {
      this->frag_color = ir;
   } else if (strcmp(ir->name, "gl_FragData") == 0) {
      this->frag_data = ir;
   } else if (strcmp(ir->name, "gl_FragDepth") == 0) {
      this->frag_depth = ir;
   }

   if (ir->mode == ir_var_in) {
      if (!strcmp(ir->name, "gl_FragCoord")) {
	 reg = emit_fragcoord_interpolation(ir);
      } else if (!strcmp(ir->name, "gl_FrontFacing")) {
	 reg = emit_frontfacing_interpolation(ir);
      } else {
	 reg = emit_general_interpolation(ir);
      }
      assert(reg);
      hash_table_insert(this->variable_ht, reg, ir);
      return;
   }

   if (ir->mode == ir_var_uniform) {
      int param_index = c->prog_data.nr_params;

      if (!strncmp(ir->name, "gl_", 3)) {
	 setup_builtin_uniform_values(ir);
      } else {
	 setup_uniform_values(ir->location, ir->type);
      }

      reg = new(this->mem_ctx) fs_reg(UNIFORM, param_index);
      reg->type = brw_type_for_base_type(ir->type);
   }

   if (!reg)
      reg = new(this->mem_ctx) fs_reg(this, ir->type);

   hash_table_insert(this->variable_ht, reg, ir);
}

void
fs_visitor::visit(ir_dereference_variable *ir)
{
   fs_reg *reg = variable_storage(ir->var);
   this->result = *reg;
}

void
fs_visitor::visit(ir_dereference_record *ir)
{
   const glsl_type *struct_type = ir->record->type;

   ir->record->accept(this);

   unsigned int offset = 0;
   for (unsigned int i = 0; i < struct_type->length; i++) {
      if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
	 break;
      offset += type_size(struct_type->fields.structure[i].type);
   }
   this->result.reg_offset += offset;
   this->result.type = brw_type_for_base_type(ir->type);
}

void
fs_visitor::visit(ir_dereference_array *ir)
{
   ir_constant *index;
   int element_size;

   ir->array->accept(this);
   index = ir->array_index->as_constant();

   element_size = type_size(ir->type);
   this->result.type = brw_type_for_base_type(ir->type);

   if (index) {
      assert(this->result.file == UNIFORM ||
	     (this->result.file == GRF &&
	      this->result.reg != 0));
      this->result.reg_offset += index->value.i[0] * element_size;
   } else {
      assert(!"FINISHME: non-constant array element");
   }
}

/* Instruction selection: Produce a MOV.sat instead of
 * MIN(MAX(val, 0), 1) when possible.
 */
bool
fs_visitor::try_emit_saturate(ir_expression *ir)
{
   ir_rvalue *sat_val = ir->as_rvalue_to_saturate();

   if (!sat_val)
      return false;

   sat_val->accept(this);
   fs_reg src = this->result;

   this->result = fs_reg(this, ir->type);
   fs_inst *inst = emit(fs_inst(BRW_OPCODE_MOV, this->result, src));
   inst->saturate = true;

   return true;
}

static uint32_t
brw_conditional_for_comparison(unsigned int op)
{
   switch (op) {
   case ir_binop_less:
      return BRW_CONDITIONAL_L;
   case ir_binop_greater:
      return BRW_CONDITIONAL_G;
   case ir_binop_lequal:
      return BRW_CONDITIONAL_LE;
   case ir_binop_gequal:
      return BRW_CONDITIONAL_GE;
   case ir_binop_equal:
   case ir_binop_all_equal: /* same as equal for scalars */
      return BRW_CONDITIONAL_Z;
   case ir_binop_nequal:
   case ir_binop_any_nequal: /* same as nequal for scalars */
      return BRW_CONDITIONAL_NZ;
   default:
      assert(!"not reached: bad operation for comparison");
      return BRW_CONDITIONAL_NZ;
   }
}

void
fs_visitor::visit(ir_expression *ir)
{
   unsigned int operand;
   fs_reg op[2], temp;
   fs_inst *inst;

   assert(ir->get_num_operands() <= 2);

   if (try_emit_saturate(ir))
      return;

   for (operand = 0; operand < ir->get_num_operands(); operand++) {
      ir->operands[operand]->accept(this);
      if (this->result.file == BAD_FILE) {
	 ir_print_visitor v;
	 printf("Failed to get tree for expression operand:\n");
	 ir->operands[operand]->accept(&v);
	 this->fail = true;
      }
      op[operand] = this->result;

      /* Matrix expression operands should have been broken down to vector
       * operations already.
       */
      assert(!ir->operands[operand]->type->is_matrix());
      /* And then those vector operands should have been broken down to scalar.
       */
      assert(!ir->operands[operand]->type->is_vector());
   }

   /* Storage for our result.  If our result goes into an assignment, it will
    * just get copy-propagated out, so no worries.
    */
   this->result = fs_reg(this, ir->type);

   switch (ir->operation) {
   case ir_unop_logic_not:
      /* Note that BRW_OPCODE_NOT is not appropriate here, since it is the
       * one's complement of the whole register, not just bit 0.
       */
      emit(fs_inst(BRW_OPCODE_XOR, this->result, op[0], fs_reg(1)));
      break;
   case ir_unop_neg:
      op[0].negate = !op[0].negate;
      this->result = op[0];
      break;
   case ir_unop_abs:
      op[0].abs = true;
      op[0].negate = false;
      this->result = op[0];
      break;
   case ir_unop_sign:
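      /* sign(x): start at 0.0, then predicate-write 1.0 where x > 0
       * and -1.0 where x < 0.
       */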
      emit(fs_inst(BRW_OPCODE_MOV, this->result, fs_reg(0.0f)));

      inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null_f, op[0], fs_reg(0.0f)));
      inst->conditional_mod = BRW_CONDITIONAL_G;
      inst = emit(fs_inst(BRW_OPCODE_MOV, this->result, fs_reg(1.0f)));
      inst->predicated = true;

      inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null_f, op[0], fs_reg(0.0f)));
      inst->conditional_mod = BRW_CONDITIONAL_L;
      inst = emit(fs_inst(BRW_OPCODE_MOV, this->result, fs_reg(-1.0f)));
      inst->predicated = true;

      break;
   case ir_unop_rcp:
      emit_math(FS_OPCODE_RCP, this->result, op[0]);
      break;

   case ir_unop_exp2:
      emit_math(FS_OPCODE_EXP2, this->result, op[0]);
      break;
   case ir_unop_log2:
      emit_math(FS_OPCODE_LOG2, this->result, op[0]);
      break;
   case ir_unop_exp:
   case ir_unop_log:
      assert(!"not reached: should be handled by ir_explog_to_explog2");
      break;
   case ir_unop_sin:
   case ir_unop_sin_reduced:
      emit_math(FS_OPCODE_SIN, this->result, op[0]);
      break;
   case ir_unop_cos:
   case ir_unop_cos_reduced:
      emit_math(FS_OPCODE_COS, this->result, op[0]);
      break;

   case ir_unop_dFdx:
      emit(fs_inst(FS_OPCODE_DDX, this->result, op[0]));
      break;
   case ir_unop_dFdy:
      emit(fs_inst(FS_OPCODE_DDY, this->result, op[0]));
      break;

   case ir_binop_add:
      emit(fs_inst(BRW_OPCODE_ADD, this->result, op[0], op[1]));
      break;
   case ir_binop_sub:
      assert(!"not reached: should be handled by ir_sub_to_add_neg");
      break;

   case ir_binop_mul:
      emit(fs_inst(BRW_OPCODE_MUL, this->result, op[0], op[1]));
      break;
   case ir_binop_div:
      assert(!"not reached: should be handled by ir_div_to_mul_rcp");
      break;
   case ir_binop_mod:
      assert(!"ir_binop_mod should have been converted to b * fract(a/b)");
      break;

   case ir_binop_less:
   case ir_binop_greater:
   case ir_binop_lequal:
   case ir_binop_gequal:
   case ir_binop_equal:
   case ir_binop_all_equal:
   case ir_binop_nequal:
   case ir_binop_any_nequal:
      temp = this->result;
      /* original gen4 does implicit conversion before comparison. */
      if (intel->gen < 5)
	 temp.type = op[0].type;

      inst = emit(fs_inst(BRW_OPCODE_CMP, temp, op[0], op[1]));
      inst->conditional_mod = brw_conditional_for_comparison(ir->operation);
      emit(fs_inst(BRW_OPCODE_AND, this->result, this->result, fs_reg(0x1)));
      break;

   case ir_binop_logic_xor:
      emit(fs_inst(BRW_OPCODE_XOR, this->result, op[0], op[1]));
      break;

   case ir_binop_logic_or:
      emit(fs_inst(BRW_OPCODE_OR, this->result, op[0], op[1]));
      break;

   case ir_binop_logic_and:
      emit(fs_inst(BRW_OPCODE_AND, this->result, op[0], op[1]));
      break;

   case ir_binop_dot:
   case ir_unop_any:
      assert(!"not reached: should be handled by brw_fs_channel_expressions");
      break;

   case ir_unop_noise:
      assert(!"not reached: should be handled by lower_noise");
      break;

   case ir_quadop_vector:
      assert(!"not reached: should be handled by lower_quadop_vector");
      break;

   case ir_unop_sqrt:
      emit_math(FS_OPCODE_SQRT, this->result, op[0]);
      break;

   case ir_unop_rsq:
      emit_math(FS_OPCODE_RSQ, this->result, op[0]);
      break;

   case ir_unop_i2f:
   case ir_unop_b2f:
   case ir_unop_b2i:
   case ir_unop_f2i:
      emit(fs_inst(BRW_OPCODE_MOV, this->result, op[0]));
      break;
   case ir_unop_f2b:
   case ir_unop_i2b:
      temp = this->result;
      /* original gen4 does implicit conversion before comparison. */
      if (intel->gen < 5)
	 temp.type = op[0].type;

      inst = emit(fs_inst(BRW_OPCODE_CMP, temp, op[0], fs_reg(0.0f)));
      inst->conditional_mod = BRW_CONDITIONAL_NZ;
      inst = emit(fs_inst(BRW_OPCODE_AND, this->result,
			  this->result, fs_reg(1)));
      break;

   case ir_unop_trunc:
      emit(fs_inst(BRW_OPCODE_RNDZ, this->result, op[0]));
      break;
   case ir_unop_ceil:
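      /* ceil(x) = -floor(-x) */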
      op[0].negate = !op[0].negate;
      inst = emit(fs_inst(BRW_OPCODE_RNDD, this->result, op[0]));
      this->result.negate = true;
      break;
   case ir_unop_floor:
      inst = emit(fs_inst(BRW_OPCODE_RNDD, this->result, op[0]));
      break;
   case ir_unop_fract:
      inst = emit(fs_inst(BRW_OPCODE_FRC, this->result, op[0]));
      break;
   case ir_unop_round_even:
      emit(fs_inst(BRW_OPCODE_RNDE, this->result, op[0]));
      break;

   case ir_binop_min:
      inst = emit(fs_inst(BRW_OPCODE_CMP, this->result, op[0], op[1]));
      inst->conditional_mod = BRW_CONDITIONAL_L;

      inst = emit(fs_inst(BRW_OPCODE_SEL, this->result, op[0], op[1]));
      inst->predicated = true;
      break;
   case ir_binop_max:
      inst = emit(fs_inst(BRW_OPCODE_CMP, this->result, op[0], op[1]));
      inst->conditional_mod = BRW_CONDITIONAL_G;

      inst = emit(fs_inst(BRW_OPCODE_SEL, this->result, op[0], op[1]));
      inst->predicated = true;
      break;

   case ir_binop_pow:
      emit_math(FS_OPCODE_POW, this->result, op[0], op[1]);
      break;

   case ir_unop_bit_not:
      inst = emit(fs_inst(BRW_OPCODE_NOT, this->result, op[0]));
      break;
   case ir_binop_bit_and:
      inst = emit(fs_inst(BRW_OPCODE_AND, this->result, op[0], op[1]));
      break;
   case ir_binop_bit_xor:
      inst = emit(fs_inst(BRW_OPCODE_XOR, this->result, op[0], op[1]));
      break;
   case ir_binop_bit_or:
      inst = emit(fs_inst(BRW_OPCODE_OR, this->result, op[0], op[1]));
      break;

   case ir_unop_u2f:
   case ir_binop_lshift:
   case ir_binop_rshift:
      assert(!"GLSL 1.30 features unsupported");
      break;
   }
}

void
fs_visitor::emit_assignment_writes(fs_reg &l, fs_reg &r,
				   const glsl_type *type, bool predicated)
{
   switch (type->base_type) {
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_BOOL:
      for (unsigned int i = 0; i < type->components(); i++) {
	 l.type = brw_type_for_base_type(type);
	 r.type = brw_type_for_base_type(type);

	 fs_inst *inst = emit(fs_inst(BRW_OPCODE_MOV, l, r));
	 inst->predicated = predicated;

	 l.reg_offset++;
	 r.reg_offset++;
      }
      break;
   case GLSL_TYPE_ARRAY:
      for (unsigned int i = 0; i < type->length; i++) {
	 emit_assignment_writes(l, r, type->fields.array, predicated);
      }
      break;

   case GLSL_TYPE_STRUCT:
      for (unsigned int i = 0; i < type->length; i++) {
	 emit_assignment_writes(l, r, type->fields.structure[i].type,
				predicated);
      }
      break;

   case GLSL_TYPE_SAMPLER:
      break;

   default:
      assert(!"not reached");
      break;
   }
}

void
fs_visitor::visit(ir_assignment *ir)
{
   struct fs_reg l, r;
   fs_inst *inst;

   /* FINISHME: arrays on the lhs */
   ir->lhs->accept(this);
   l = this->result;

   ir->rhs->accept(this);
   r = this->result;

   assert(l.file != BAD_FILE);
   assert(r.file != BAD_FILE);

   if (ir->condition) {
      emit_bool_to_cond_code(ir->condition);
   }

   if (ir->lhs->type->is_scalar() ||
       ir->lhs->type->is_vector()) {
      for (int i = 0; i < ir->lhs->type->vector_elements; i++) {
	 if (ir->write_mask & (1 << i)) {
	    inst = emit(fs_inst(BRW_OPCODE_MOV, l, r));
	    if (ir->condition)
	       inst->predicated = true;
	    r.reg_offset++;
	 }
	 l.reg_offset++;
      }
   } else {
      emit_assignment_writes(l, r, ir->lhs->type, ir->condition != NULL);
   }
}

fs_inst *
fs_visitor::emit_texture_gen4(ir_texture *ir, fs_reg dst, fs_reg coordinate)
{
   int mlen;
   int base_mrf = 1;
   bool simd16 = false;
   fs_reg orig_dst;

   /* g0 header. */
   mlen = 1;

   if (ir->shadow_comparitor) {
      for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
	 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i),
		      coordinate));
	 coordinate.reg_offset++;
      }
      /* gen4's SIMD8 sampler always has the slots for u,v,r present. */
      mlen += 3;

      if (ir->op == ir_tex) {
	 /* There's no plain shadow compare message, so we use shadow
	  * compare with a bias of 0.0.
	  */
	 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen),
		      fs_reg(0.0f)));
	 mlen++;
      } else if (ir->op == ir_txb) {
	 ir->lod_info.bias->accept(this);
	 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen),
		      this->result));
	 mlen++;
      } else {
	 assert(ir->op == ir_txl);
	 ir->lod_info.lod->accept(this);
	 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen),
		      this->result));
	 mlen++;
      }

      ir->shadow_comparitor->accept(this);
      emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result));
      mlen++;
   } else if (ir->op == ir_tex) {
      for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
	 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i),
		      coordinate));
	 coordinate.reg_offset++;
      }
      /* gen4's SIMD8 sampler always has the slots for u,v,r present. */
      mlen += 3;
   } else {
      /* Oh joy.  gen4 doesn't have SIMD8 non-shadow-compare bias/lod
       * instructions.  We'll need to do SIMD16 here.
       */
      assert(ir->op == ir_txb || ir->op == ir_txl);

      for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
	 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i * 2),
		      coordinate));
	 coordinate.reg_offset++;
      }

      /* In SIMD16, u/v/r take two registers each; lod/bias follows them. */
      mlen += 6;

      if (ir->op == ir_txb) {
	 ir->lod_info.bias->accept(this);
	 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen),
		      this->result));
	 mlen++;
      } else {
	 ir->lod_info.lod->accept(this);
	 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen),
		      this->result));
	 mlen++;
      }

      /* The unused upper half. */
      mlen++;

      /* Now, since we're doing simd16, the return is 2 interleaved
       * vec4s where the odd-indexed ones are junk. We'll need to move
       * this weirdness around to the expected layout.
       */
      simd16 = true;
      orig_dst = dst;
      dst = fs_reg(this, glsl_type::get_array_instance(glsl_type::vec4_type,
						       2));
      dst.type = BRW_REGISTER_TYPE_F;
   }

   fs_inst *inst = NULL;
   switch (ir->op) {
   case ir_tex:
      inst = emit(fs_inst(FS_OPCODE_TEX, dst));
      break;
   case ir_txb:
      inst = emit(fs_inst(FS_OPCODE_TXB, dst));
      break;
   case ir_txl:
      inst = emit(fs_inst(FS_OPCODE_TXL, dst));
      break;
   case ir_txd:
   case ir_txf:
      assert(!"GLSL 1.30 features unsupported");
      break;
   }
   inst->base_mrf = base_mrf;
   inst->mlen = mlen;

   if (simd16) {
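      /* Copy the even-indexed (valid) slots into the packed destination,
       * skipping the junk odd-indexed halves.
       */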
      for (int i = 0; i < 4; i++) {
	 emit(fs_inst(BRW_OPCODE_MOV, orig_dst, dst));
	 orig_dst.reg_offset++;
	 dst.reg_offset += 2;
      }
   }

   return inst;
}

fs_inst *
fs_visitor::emit_texture_gen5(ir_texture *ir, fs_reg dst, fs_reg coordinate)
{
   /* gen5's SIMD8 sampler has slots for u, v, r, array index, then
    * optional parameters like the shadow comparator or LOD bias.  If the
    * optional parameters aren't present, those trailing slots can be
    * omitted from the message.
    *
    * We don't fill in the unnecessary slots regardless, which may
    * look surprising in the disassembly.
    */
   int mlen = 1; /* g0 header always present. */
   int base_mrf = 1;

   for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
      emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i),
		   coordinate));
      coordinate.reg_offset++;
   }
   mlen += ir->coordinate->type->vector_elements;

   if (ir->shadow_comparitor) {
      mlen = MAX2(mlen, 5);

      ir->shadow_comparitor->accept(this);
      emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result));
      mlen++;
   }

   fs_inst *inst = NULL;
   switch (ir->op) {
   case ir_tex:
      inst = emit(fs_inst(FS_OPCODE_TEX, dst));
      break;
   case ir_txb:
      ir->lod_info.bias->accept(this);
      mlen = MAX2(mlen, 5);
      emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result));
      mlen++;

      inst = emit(fs_inst(FS_OPCODE_TXB, dst));
      break;
   case ir_txl:
      ir->lod_info.lod->accept(this);
      mlen = MAX2(mlen, 5);
      emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result));
      mlen++;

      inst = emit(fs_inst(FS_OPCODE_TXL, dst));
      break;
   case ir_txd:
   case ir_txf:
      assert(!"GLSL 1.30 features unsupported");
      break;
   }
   inst->base_mrf = base_mrf;
   inst->mlen = mlen;

   return inst;
}

void
fs_visitor::visit(ir_texture *ir)
{
   int sampler;
   fs_inst *inst = NULL;

   ir->coordinate->accept(this);
   fs_reg coordinate = this->result;

   /* Should be lowered by do_lower_texture_projection */
   assert(!ir->projector);

   sampler = _mesa_get_sampler_uniform_value(ir->sampler,
					     ctx->Shader.CurrentFragmentProgram,
					     &brw->fragment_program->Base);
   sampler = c->fp->program.Base.SamplerUnits[sampler];

   /* The 965 requires the EU to do the normalization of GL rectangle
    * texture coordinates.  We use the program parameter state
    * tracking to get the scaling factor.
    */
   if (ir->sampler->type->sampler_dimensionality == GLSL_SAMPLER_DIM_RECT) {
      struct gl_program_parameter_list *params = c->fp->program.Base.Parameters;
      int tokens[STATE_LENGTH] = {
	 STATE_INTERNAL,
	 STATE_TEXRECT_SCALE,
	 sampler,
	 0,
	 0
      };

      c->prog_data.param_convert[c->prog_data.nr_params] =
	 PARAM_NO_CONVERT;
      c->prog_data.param_convert[c->prog_data.nr_params + 1] =
	 PARAM_NO_CONVERT;

      fs_reg scale_x = fs_reg(UNIFORM, c->prog_data.nr_params);
      fs_reg scale_y = fs_reg(UNIFORM, c->prog_data.nr_params + 1);
      GLuint index = _mesa_add_state_reference(params,
					       (gl_state_index *)tokens);
      float *vec_values = this->fp->Base.Parameters->ParameterValues[index];

      c->prog_data.param[c->prog_data.nr_params++] = &vec_values[0];
      c->prog_data.param[c->prog_data.nr_params++] = &vec_values[1];

      fs_reg dst = fs_reg(this, ir->coordinate->type);
      fs_reg src = coordinate;
      coordinate = dst;

      emit(fs_inst(BRW_OPCODE_MUL, dst, src, scale_x));
      dst.reg_offset++;
      src.reg_offset++;
      emit(fs_inst(BRW_OPCODE_MUL, dst, src, scale_y));
   }

   /* Writemasking doesn't eliminate channels on SIMD8 texture
    * samples, so don't worry about them.
    */
   fs_reg dst = fs_reg(this, glsl_type::vec4_type);

   if (intel->gen < 5) {
      inst = emit_texture_gen4(ir, dst, coordinate);
   } else {
      inst = emit_texture_gen5(ir, dst, coordinate);
   }

   inst->sampler = sampler;

   this->result = dst;

   if (ir->shadow_comparitor)
      inst->shadow_compare = true;

   if (c->key.tex_swizzles[inst->sampler] != SWIZZLE_NOOP) {
      fs_reg swizzle_dst = fs_reg(this, glsl_type::vec4_type);

      for (int i = 0; i < 4; i++) {
	 int swiz = GET_SWZ(c->key.tex_swizzles[inst->sampler], i);
	 fs_reg l = swizzle_dst;
	 l.reg_offset += i;

	 if (swiz == SWIZZLE_ZERO) {
	    emit(fs_inst(BRW_OPCODE_MOV, l, fs_reg(0.0f)));
	 } else if (swiz == SWIZZLE_ONE) {
	    emit(fs_inst(BRW_OPCODE_MOV, l, fs_reg(1.0f)));
	 } else {
	    fs_reg r = dst;
	    r.reg_offset += GET_SWZ(c->key.tex_swizzles[inst->sampler], i);
	    emit(fs_inst(BRW_OPCODE_MOV, l, r));
	 }
      }
      this->result = swizzle_dst;
   }
}

void
fs_visitor::visit(ir_swizzle *ir)
{
   ir->val->accept(this);
   fs_reg val = this->result;

   if (ir->type->vector_elements == 1) {
      this->result.reg_offset += ir->mask.x;
      return;
   }

   fs_reg result = fs_reg(this, ir->type);
   this->result = result;

   for (unsigned int i = 0; i < ir->type->vector_elements; i++) {
      fs_reg channel = val;
      int swiz = 0;

      switch (i) {
      case 0:
	 swiz = ir->mask.x;
	 break;
      case 1:
	 swiz = ir->mask.y;
	 break;
      case 2:
	 swiz = ir->mask.z;
	 break;
      case 3:
	 swiz = ir->mask.w;
	 break;
      }

      channel.reg_offset += swiz;
      emit(fs_inst(BRW_OPCODE_MOV, result, channel));
      result.reg_offset++;
   }
}

void
fs_visitor::visit(ir_discard *ir)
{
   fs_reg temp = fs_reg(this, glsl_type::uint_type);

   assert(ir->condition == NULL); /* FINISHME */

   emit(fs_inst(FS_OPCODE_DISCARD_NOT, temp, reg_null_d));
   emit(fs_inst(FS_OPCODE_DISCARD_AND, reg_null_d, temp));
   kill_emitted = true;
}

void
fs_visitor::visit(ir_constant *ir)
{
   /* Set this->result to reg at the bottom of the function because some code
    * paths will cause this visitor to be applied to other fields.  This will
    * cause the value stored in this->result to be modified.
    *
    * Make reg constant so that it doesn't get accidentally modified along the
    * way.  Yes, I actually had this problem. :(
    */
   const fs_reg reg(this, ir->type);
   fs_reg dst_reg = reg;

   if (ir->type->is_array()) {
      const unsigned size = type_size(ir->type->fields.array);

      for (unsigned i = 0; i < ir->type->length; i++) {
	 ir->array_elements[i]->accept(this);
	 fs_reg src_reg = this->result;

	 dst_reg.type = src_reg.type;
	 for (unsigned j = 0; j < size; j++) {
	    emit(fs_inst(BRW_OPCODE_MOV, dst_reg, src_reg));
	    src_reg.reg_offset++;
	    dst_reg.reg_offset++;
	 }
      }
   } else if (ir->type->is_record()) {
      foreach_list(node, &ir->components) {
	 ir_instruction *const field = (ir_instruction *) node;
	 const unsigned size = type_size(field->type);

	 field->accept(this);
	 fs_reg src_reg = this->result;

	 dst_reg.type = src_reg.type;
	 for (unsigned j = 0; j < size; j++) {
	    emit(fs_inst(BRW_OPCODE_MOV, dst_reg, src_reg));
	    src_reg.reg_offset++;
	    dst_reg.reg_offset++;
	 }
      }
   } else {
      const unsigned size = type_size(ir->type);

      for (unsigned i = 0; i < size; i++) {
	 switch (ir->type->base_type) {
	 case GLSL_TYPE_FLOAT:
	    emit(fs_inst(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.f[i])));
	    break;
	 case GLSL_TYPE_UINT:
	    emit(fs_inst(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.u[i])));
	    break;
	 case GLSL_TYPE_INT:
	    emit(fs_inst(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.i[i])));
	    break;
	 case GLSL_TYPE_BOOL:
	    emit(fs_inst(BRW_OPCODE_MOV, dst_reg, fs_reg((int)ir->value.b[i])));
	    break;
	 default:
	    assert(!"Non-float/uint/int/bool constant");
	 }
	 dst_reg.reg_offset++;
      }
   }

   this->result = reg;
}

void
fs_visitor::emit_bool_to_cond_code(ir_rvalue *ir)
{
   ir_expression *expr = ir->as_expression();

   if (expr) {
      fs_reg op[2];
      fs_inst *inst;

      assert(expr->get_num_operands() <= 2);
      for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
	 assert(expr->operands[i]->type->is_scalar());

	 expr->operands[i]->accept(this);
	 op[i] = this->result;
      }

      switch (expr->operation) {
      case ir_unop_logic_not:
	 inst = emit(fs_inst(BRW_OPCODE_AND, reg_null_d, op[0], fs_reg(1)));
	 inst->conditional_mod = BRW_CONDITIONAL_Z;
	 break;

      case ir_binop_logic_xor:
	 inst = emit(fs_inst(BRW_OPCODE_XOR, reg_null_d, op[0], op[1]));
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 break;

      case ir_binop_logic_or:
	 inst = emit(fs_inst(BRW_OPCODE_OR, reg_null_d, op[0], op[1]));
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 break;

      case ir_binop_logic_and:
	 inst = emit(fs_inst(BRW_OPCODE_AND, reg_null_d, op[0], op[1]));
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 break;

      case ir_unop_f2b:
	 if (intel->gen >= 6) {
	    inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null_d,
				op[0], fs_reg(0.0f)));
	 } else {
	    inst = emit(fs_inst(BRW_OPCODE_MOV, reg_null_f, op[0]));
	 }
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 break;

      case ir_unop_i2b:
	 if (intel->gen >= 6) {
	    inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null_d, op[0], fs_reg(0)));
	 } else {
	    inst = emit(fs_inst(BRW_OPCODE_MOV, reg_null_d, op[0]));
	 }
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 break;

      case ir_binop_greater:
      case ir_binop_gequal:
      case ir_binop_less:
      case ir_binop_lequal:
      case ir_binop_equal:
      case ir_binop_all_equal:
      case ir_binop_nequal:
      case ir_binop_any_nequal:
	 inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null_cmp, op[0], op[1]));
	 inst->conditional_mod =
	    brw_conditional_for_comparison(expr->operation);
	 break;

      default:
	 assert(!"not reached");
	 this->fail = true;
	 break;
      }
      return;
   }

   ir->accept(this);

   if (intel->gen >= 6) {
      fs_inst *inst = emit(fs_inst(BRW_OPCODE_AND, reg_null_d,
				   this->result, fs_reg(1)));
      inst->conditional_mod = BRW_CONDITIONAL_NZ;
   } else {
      fs_inst *inst = emit(fs_inst(BRW_OPCODE_MOV, reg_null_d, this->result));
      inst->conditional_mod = BRW_CONDITIONAL_NZ;
   }
}

/**
 * Emit a gen6 IF statement with the comparison folded into the IF
 * instruction.
 */
void
fs_visitor::emit_if_gen6(ir_if *ir)
{
   ir_expression *expr = ir->condition->as_expression();

   if (expr) {
      fs_reg op[2];
      fs_inst *inst;
      fs_reg temp;

      assert(expr->get_num_operands() <= 2);
      for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
	 assert(expr->operands[i]->type->is_scalar());

	 expr->operands[i]->accept(this);
	 op[i] = this->result;
      }

      switch (expr->operation) {
      case ir_unop_logic_not:
	 inst = emit(fs_inst(BRW_OPCODE_IF, temp, op[0], fs_reg(0)));
	 inst->conditional_mod = BRW_CONDITIONAL_Z;
	 return;

      case ir_binop_logic_xor:
	 inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_d, op[0], op[1]));
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 return;

      case ir_binop_logic_or:
	 temp = fs_reg(this, glsl_type::bool_type);
	 emit(fs_inst(BRW_OPCODE_OR, temp, op[0], op[1]));
	 inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_d, temp, fs_reg(0)));
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 return;

      case ir_binop_logic_and:
	 temp = fs_reg(this, glsl_type::bool_type);
	 emit(fs_inst(BRW_OPCODE_AND, temp, op[0], op[1]));
	 inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_d, temp, fs_reg(0)));
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 return;

      case ir_unop_f2b:
	 inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_f, op[0], fs_reg(0)));
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 return;

      case ir_unop_i2b:
	 inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_d, op[0], fs_reg(0)));
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 return;

      case ir_binop_greater:
      case ir_binop_gequal:
      case ir_binop_less:
      case ir_binop_lequal:
      case ir_binop_equal:
      case ir_binop_all_equal:
      case ir_binop_nequal:
      case ir_binop_any_nequal:
	 inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_d, op[0], op[1]));
	 inst->conditional_mod =
	    brw_conditional_for_comparison(expr->operation);
	 return;
      default:
	 assert(!"not reached");
	 inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_d, op[0], fs_reg(0)));
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 this->fail = true;
	 return;
      }
      return;
   }

   ir->condition->accept(this);

   fs_inst *inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_d, this->result, fs_reg(0)));
   inst->conditional_mod = BRW_CONDITIONAL_NZ;
}

void
fs_visitor::visit(ir_if *ir)
{
   fs_inst *inst;

   /* Don't point the annotation at the if statement, because then it plus
    * the then and else blocks get printed.
    */
   this->base_ir = ir->condition;

   if (intel->gen >= 6) {
      emit_if_gen6(ir);
   } else {
      emit_bool_to_cond_code(ir->condition);

      inst = emit(fs_inst(BRW_OPCODE_IF));
      inst->predicated = true;
   }

   foreach_iter(exec_list_iterator, iter, ir->then_instructions) {
      ir_instruction *ir = (ir_instruction *)iter.get();
      this->base_ir = ir;

      ir->accept(this);
   }

   if (!ir->else_instructions.is_empty()) {
      emit(fs_inst(BRW_OPCODE_ELSE));

      foreach_iter(exec_list_iterator, iter, ir->else_instructions) {
	 ir_instruction *ir = (ir_instruction *)iter.get();
	 this->base_ir = ir;

	 ir->accept(this);
      }
   }

   emit(fs_inst(BRW_OPCODE_ENDIF));
}

void
fs_visitor::visit(ir_loop *ir)
{
   fs_reg counter = reg_undef;

   if (ir->counter) {
      this->base_ir = ir->counter;
      ir->counter->accept(this);
      counter = *(variable_storage(ir->counter));

      if (ir->from) {
	 this->base_ir = ir->from;
	 ir->from->accept(this);

	 emit(fs_inst(BRW_OPCODE_MOV, counter, this->result));
      }
   }

   emit(fs_inst(BRW_OPCODE_DO));

   if (ir->to) {
      this->base_ir = ir->to;
      ir->to->accept(this);

      fs_inst *inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null_cmp,
				   counter, this->result));
      inst->conditional_mod = brw_conditional_for_comparison(ir->cmp);

      inst = emit(fs_inst(BRW_OPCODE_BREAK));
      inst->predicated = true;
   }

   foreach_iter(exec_list_iterator, iter, ir->body_instructions) {
      ir_instruction *ir = (ir_instruction *)iter.get();

      this->base_ir = ir;
      ir->accept(this);
   }

   if (ir->increment) {
      this->base_ir = ir->increment;
      ir->increment->accept(this);
      emit(fs_inst(BRW_OPCODE_ADD, counter, counter, this->result));
   }

   emit(fs_inst(BRW_OPCODE_WHILE));
}

void
fs_visitor::visit(ir_loop_jump *ir)
{
   switch (ir->mode) {
   case ir_loop_jump::jump_break:
      emit(fs_inst(BRW_OPCODE_BREAK));
      break;
   case ir_loop_jump::jump_continue:
      emit(fs_inst(BRW_OPCODE_CONTINUE));
      break;
   }
}

void
fs_visitor::visit(ir_call *ir)
{
   assert(!"FINISHME");
}

void
fs_visitor::visit(ir_return *ir)
{
   assert(!"FINISHME");
}

void
fs_visitor::visit(ir_function *ir)
{
   /* Ignore function bodies other than main() -- we shouldn't see calls to
    * them since they should all be inlined by the time we get here.
    */
   if (strcmp(ir->name, "main") == 0) {
      const ir_function_signature *sig;
      exec_list empty;

      sig = ir->matching_signature(&empty);

      assert(sig);

      foreach_iter(exec_list_iterator, iter, sig->body) {
	 ir_instruction *ir = (ir_instruction *)iter.get();
	 this->base_ir = ir;

	 ir->accept(this);
      }
   }
}

void
fs_visitor::visit(ir_function_signature *ir)
{
   assert(!"not reached");
   (void)ir;
}

fs_inst *
fs_visitor::emit(fs_inst inst)
{
   fs_inst *list_inst = new(mem_ctx) fs_inst;
   *list_inst = inst;

   list_inst->annotation = this->current_annotation;
   list_inst->ir = this->base_ir;

   this->instructions.push_tail(list_inst);

   return list_inst;
}

/** Emits a dummy fragment shader consisting of magenta for bringup purposes. */
void
fs_visitor::emit_dummy_fs()
{
   /* Everyone's favorite color. */
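   /* (R, G, B, A) = (1, 0, 1, 0), written to color MRFs 2..5. */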
   emit(fs_inst(BRW_OPCODE_MOV,
		fs_reg(MRF, 2),
		fs_reg(1.0f)));
   emit(fs_inst(BRW_OPCODE_MOV,
		fs_reg(MRF, 3),
		fs_reg(0.0f)));
   emit(fs_inst(BRW_OPCODE_MOV,
		fs_reg(MRF, 4),
		fs_reg(1.0f)));
   emit(fs_inst(BRW_OPCODE_MOV,
		fs_reg(MRF, 5),
		fs_reg(0.0f)));

   fs_inst *write;
   write = emit(fs_inst(FS_OPCODE_FB_WRITE,
			fs_reg(0),
			fs_reg(0)));
   write->base_mrf = 0;
}

/* The register location here is relative to the start of the URB
 * data.  It will get adjusted to be a real location before
 * generate_code() time.
1907 */
1908struct brw_reg
1909fs_visitor::interp_reg(int location, int channel)
1910{
1911   int regnr = urb_setup[location] * 2 + channel / 2;
1912   int stride = (channel & 1) * 4;
1913
1914   assert(urb_setup[location] != -1);
1915
1916   return brw_vec1_grf(regnr, stride);
1917}
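
/* Worked example (illustrative, not from the original source): with
 * urb_setup[location] == 1 and channel == 3 (the .w channel), regnr is
 * 1 * 2 + 3 / 2 = 3 and the subregister offset is (3 & 1) * 4 = 4
 * floats.  Each channel's interpolation coefficients occupy half a
 * register, two channels per GRF, so every attribute takes two regs.
 */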
1918
1919/** Emits the interpolation for the varying inputs. */
1920void
1921fs_visitor::emit_interpolation_setup_gen4()
1922{
1923   struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
1924
1925   this->current_annotation = "compute pixel centers";
1926   this->pixel_x = fs_reg(this, glsl_type::uint_type);
1927   this->pixel_y = fs_reg(this, glsl_type::uint_type);
1928   this->pixel_x.type = BRW_REGISTER_TYPE_UW;
1929   this->pixel_y.type = BRW_REGISTER_TYPE_UW;
1930   emit(fs_inst(BRW_OPCODE_ADD,
1931		this->pixel_x,
1932		fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
1933		fs_reg(brw_imm_v(0x10101010))));
1934   emit(fs_inst(BRW_OPCODE_ADD,
1935		this->pixel_y,
1936		fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
1937		fs_reg(brw_imm_v(0x11001100))));
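
   /* For reference (a sketch of the encoding, not in the original
    * source): brw_imm_v() packs eight 4-bit immediates, so 0x10101010
    * decodes to {0,1,0,1,0,1,0,1} and 0x11001100 to {0,0,1,1,0,0,1,1}:
    * the x/y offsets of the four pixels within each 2x2 subspan, added
    * to the subspan origins replicated out of g1.
    */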
1938
1939   this->current_annotation = "compute pixel deltas from v0";
1940   if (brw->has_pln) {
1941      this->delta_x = fs_reg(this, glsl_type::vec2_type);
1942      this->delta_y = this->delta_x;
1943      this->delta_y.reg_offset++;
1944   } else {
1945      this->delta_x = fs_reg(this, glsl_type::float_type);
1946      this->delta_y = fs_reg(this, glsl_type::float_type);
1947   }
1948   emit(fs_inst(BRW_OPCODE_ADD,
1949		this->delta_x,
1950		this->pixel_x,
1951		fs_reg(negate(brw_vec1_grf(1, 0)))));
1952   emit(fs_inst(BRW_OPCODE_ADD,
1953		this->delta_y,
1954		this->pixel_y,
1955		fs_reg(negate(brw_vec1_grf(1, 1)))));
1956
1957   this->current_annotation = "compute pos.w and 1/pos.w";
1958   /* Compute wpos.w.  It's always in our setup, since it's needed to
1959    * interpolate the other attributes.
1960    */
1961   this->wpos_w = fs_reg(this, glsl_type::float_type);
1962   emit(fs_inst(FS_OPCODE_LINTERP, wpos_w, this->delta_x, this->delta_y,
1963		interp_reg(FRAG_ATTRIB_WPOS, 3)));
1964   /* Compute the pixel 1/W value from wpos.w. */
1965   this->pixel_w = fs_reg(this, glsl_type::float_type);
1966   emit_math(FS_OPCODE_RCP, this->pixel_w, wpos_w);
1967   this->current_annotation = NULL;
1968}
1969
1970/** Emits the interpolation for the varying inputs. */
1971void
1972fs_visitor::emit_interpolation_setup_gen6()
1973{
1974   struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
1975
1976   /* If the pixel centers end up being used, the setup is the same as on gen4. */
1977   this->current_annotation = "compute pixel centers";
1978   fs_reg int_pixel_x = fs_reg(this, glsl_type::uint_type);
1979   fs_reg int_pixel_y = fs_reg(this, glsl_type::uint_type);
1980   int_pixel_x.type = BRW_REGISTER_TYPE_UW;
1981   int_pixel_y.type = BRW_REGISTER_TYPE_UW;
1982   emit(fs_inst(BRW_OPCODE_ADD,
1983		int_pixel_x,
1984		fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
1985		fs_reg(brw_imm_v(0x10101010))));
1986   emit(fs_inst(BRW_OPCODE_ADD,
1987		int_pixel_y,
1988		fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
1989		fs_reg(brw_imm_v(0x11001100))));
1990
1991   /* As of gen6, we can no longer mix float and int sources.  We have
1992    * to turn the integer pixel centers into floats for their actual
1993    * use.
1994    */
1995   this->pixel_x = fs_reg(this, glsl_type::float_type);
1996   this->pixel_y = fs_reg(this, glsl_type::float_type);
1997   emit(fs_inst(BRW_OPCODE_MOV, this->pixel_x, int_pixel_x));
1998   emit(fs_inst(BRW_OPCODE_MOV, this->pixel_y, int_pixel_y));
1999
2000   this->current_annotation = "compute 1/pos.w";
2001   this->wpos_w = fs_reg(brw_vec8_grf(c->source_w_reg, 0));
2002   this->pixel_w = fs_reg(this, glsl_type::float_type);
2003   emit_math(FS_OPCODE_RCP, this->pixel_w, wpos_w);
2004
2005   this->delta_x = fs_reg(brw_vec8_grf(2, 0));
2006   this->delta_y = fs_reg(brw_vec8_grf(3, 0));
2007
2008   this->current_annotation = NULL;
2009}
2010
2011void
2012fs_visitor::emit_fb_writes()
2013{
2014   this->current_annotation = "FB write header";
2015   bool header_present = true;
2016   int nr = 0;
2017
2018   if (intel->gen >= 6 &&
2019       !this->kill_emitted &&
2020       c->key.nr_color_regions == 1) {
2021      header_present = false;
2022   }
2023
2024   if (header_present) {
2025      /* m0, m1 header */
2026      nr += 2;
2027   }
2028
2029   if (c->aa_dest_stencil_reg) {
2030      emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
2031		   fs_reg(brw_vec8_grf(c->aa_dest_stencil_reg, 0))));
2032   }
2033
2034   /* Reserve space for color. It'll be filled in per MRT below. */
2035   int color_mrf = nr;
2036   nr += 4;
2037
2038   if (c->source_depth_to_render_target) {
2039      if (c->computes_depth) {
2040	 /* Hand over gl_FragDepth. */
2041	 assert(this->frag_depth);
2042	 fs_reg depth = *(variable_storage(this->frag_depth));
2043
2044	 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, nr++), depth));
2045      } else {
2046	 /* Pass through the payload depth. */
2047	 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
2048		      fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
2049      }
2050   }
2051
2052   if (c->dest_depth_reg) {
2053      emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
2054		   fs_reg(brw_vec8_grf(c->dest_depth_reg, 0))));
2055   }
2056
2057   fs_reg color = reg_undef;
2058   if (this->frag_color)
2059      color = *(variable_storage(this->frag_color));
2060   else if (this->frag_data) {
2061      color = *(variable_storage(this->frag_data));
2062      color.type = BRW_REGISTER_TYPE_F;
2063   }
2064
2065   for (int target = 0; target < c->key.nr_color_regions; target++) {
2066      this->current_annotation = talloc_asprintf(this->mem_ctx,
2067						 "FB write target %d",
2068						 target);
2069      if (this->frag_color || this->frag_data) {
2070	 for (int i = 0; i < 4; i++) {
2071	    emit(fs_inst(BRW_OPCODE_MOV,
2072			 fs_reg(MRF, color_mrf + i),
2073			 color));
2074	    color.reg_offset++;
2075	 }
2076      }
2077
2078      if (this->frag_color)
2079	 color.reg_offset -= 4;
2080
2081      fs_inst *inst = emit(fs_inst(FS_OPCODE_FB_WRITE,
2082				   reg_undef, reg_undef));
2083      inst->target = target;
2084      inst->base_mrf = 0;
2085      inst->mlen = nr;
2086      if (target == c->key.nr_color_regions - 1)
2087	 inst->eot = true;
2088      inst->header_present = header_present;
2089   }
2090
2091   if (c->key.nr_color_regions == 0) {
2092      fs_inst *inst = emit(fs_inst(FS_OPCODE_FB_WRITE,
2093				   reg_undef, reg_undef));
2094      inst->base_mrf = 0;
2095      inst->mlen = nr;
2096      inst->eot = true;
2097      inst->header_present = header_present;
2098   }
2099
2100   this->current_annotation = NULL;
2101}
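
/* A worked message layout (illustrative): with a header and no
 * AA-stencil or depth payload, the write is the m0-m1 header plus
 * m2-m5 RGBA, so mlen == 6; each render target refills m2-m5 and sends
 * again, with EOT set only on the last one.
 */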
2102
2103void
2104fs_visitor::generate_fb_write(fs_inst *inst)
2105{
2106   GLboolean eot = inst->eot;
2107   struct brw_reg implied_header;
2108
2109   /* The header is 2 regs; g0 and g1 are its contents.  g0 is handled by
2110    * the implied move, so emit g1 here.
2111    */
2112   brw_push_insn_state(p);
2113   brw_set_mask_control(p, BRW_MASK_DISABLE);
2114   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2115
2116   if (inst->header_present) {
2117      if (intel->gen >= 6) {
2118	 brw_MOV(p,
2119		 brw_message_reg(inst->base_mrf),
2120		 brw_vec8_grf(0, 0));
2121
2122	 if (inst->target > 0) {
2123	    /* Set the render target index for choosing BLEND_STATE. */
2124	    brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, 0, 2),
2125			      BRW_REGISTER_TYPE_UD),
2126		    brw_imm_ud(inst->target));
2127	 }
2128
2129	 /* Clear viewport index, render target array index. */
2130	 brw_AND(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, 0, 0),
2131			   BRW_REGISTER_TYPE_UD),
2132		 retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD),
2133		 brw_imm_ud(0xf7ff));
2134
2135	 implied_header = brw_null_reg();
2136      } else {
2137	 implied_header = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
2138      }
2139
2140      brw_MOV(p,
2141	      brw_message_reg(inst->base_mrf + 1),
2142	      brw_vec8_grf(1, 0));
2143   } else {
2144      implied_header = brw_null_reg();
2145   }
2146
2147   brw_pop_insn_state(p);
2148
2149   brw_fb_WRITE(p,
2150		8, /* dispatch_width */
2151		retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW),
2152		inst->base_mrf,
2153		implied_header,
2154		inst->target,
2155		inst->mlen,
2156		0,
2157		eot,
2158		inst->header_present);
2159}
2160
2161void
2162fs_visitor::generate_linterp(fs_inst *inst,
2163			     struct brw_reg dst, struct brw_reg *src)
2164{
2165   struct brw_reg delta_x = src[0];
2166   struct brw_reg delta_y = src[1];
2167   struct brw_reg interp = src[2];
2168
2169   if (brw->has_pln &&
2170       delta_y.nr == delta_x.nr + 1 &&
2171       (intel->gen >= 6 || (delta_x.nr & 1) == 0)) {
2172      brw_PLN(p, dst, interp, delta_x);
2173   } else {
2174      brw_LINE(p, brw_null_reg(), interp, delta_x);
2175      brw_MAC(p, dst, suboffset(interp, 1), delta_y);
2176   }
2177}
2178
2179void
2180fs_visitor::generate_math(fs_inst *inst,
2181			  struct brw_reg dst, struct brw_reg *src)
2182{
2183   int op;
2184
2185   switch (inst->opcode) {
2186   case FS_OPCODE_RCP:
2187      op = BRW_MATH_FUNCTION_INV;
2188      break;
2189   case FS_OPCODE_RSQ:
2190      op = BRW_MATH_FUNCTION_RSQ;
2191      break;
2192   case FS_OPCODE_SQRT:
2193      op = BRW_MATH_FUNCTION_SQRT;
2194      break;
2195   case FS_OPCODE_EXP2:
2196      op = BRW_MATH_FUNCTION_EXP;
2197      break;
2198   case FS_OPCODE_LOG2:
2199      op = BRW_MATH_FUNCTION_LOG;
2200      break;
2201   case FS_OPCODE_POW:
2202      op = BRW_MATH_FUNCTION_POW;
2203      break;
2204   case FS_OPCODE_SIN:
2205      op = BRW_MATH_FUNCTION_SIN;
2206      break;
2207   case FS_OPCODE_COS:
2208      op = BRW_MATH_FUNCTION_COS;
2209      break;
2210   default:
2211      assert(!"not reached: unknown math function");
2212      op = 0;
2213      break;
2214   }
2215
2216   if (intel->gen >= 6) {
2217      assert(inst->mlen == 0);
2218
2219      if (inst->opcode == FS_OPCODE_POW) {
2220	 brw_math2(p, dst, op, src[0], src[1]);
2221      } else {
2222	 brw_math(p, dst,
2223		  op,
2224		  inst->saturate ? BRW_MATH_SATURATE_SATURATE :
2225		  BRW_MATH_SATURATE_NONE,
2226		  0, src[0],
2227		  BRW_MATH_DATA_VECTOR,
2228		  BRW_MATH_PRECISION_FULL);
2229      }
2230   } else {
2231      assert(inst->mlen >= 1);
2232
2233      brw_math(p, dst,
2234	       op,
2235	       inst->saturate ? BRW_MATH_SATURATE_SATURATE :
2236	       BRW_MATH_SATURATE_NONE,
2237	       inst->base_mrf, src[0],
2238	       BRW_MATH_DATA_VECTOR,
2239	       BRW_MATH_PRECISION_FULL);
2240   }
2241}
2242
2243void
2244fs_visitor::generate_tex(fs_inst *inst, struct brw_reg dst)
2245{
2246   int msg_type = -1;
2247   int rlen = 4;
2248   uint32_t simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
2249
2250   if (intel->gen >= 5) {
2251      switch (inst->opcode) {
2252      case FS_OPCODE_TEX:
2253	 if (inst->shadow_compare) {
2254	    msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_COMPARE_GEN5;
2255	 } else {
2256	    msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_GEN5;
2257	 }
2258	 break;
2259      case FS_OPCODE_TXB:
2260	 if (inst->shadow_compare) {
2261	    msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE_GEN5;
2262	 } else {
2263	    msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_BIAS_GEN5;
2264	 }
2265	 break;
2266      }
2267   } else {
2268      switch (inst->opcode) {
2269      case FS_OPCODE_TEX:
2270	 /* Note that G45 and older determine shadow compare and dispatch width
2271	  * from message length for most messages.
2272	  */
2273	 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
2274	 if (inst->shadow_compare) {
2275	    assert(inst->mlen == 6);
2276	 } else {
2277	    assert(inst->mlen <= 4);
2278	 }
2279	 break;
2280      case FS_OPCODE_TXB:
2281	 if (inst->shadow_compare) {
2282	    assert(inst->mlen == 6);
2283	    msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
2284	 } else {
2285	    assert(inst->mlen == 9);
2286	    msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
2287	    simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
2288	 }
2289	 break;
2290      }
2291   }
2292   assert(msg_type != -1);
2293
2294   if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) {
2295      rlen = 8;
2296      dst = vec16(dst);
2297   }
2298
2299   brw_SAMPLE(p,
2300	      retype(dst, BRW_REGISTER_TYPE_UW),
2301	      inst->base_mrf,
2302	      retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
2303              SURF_INDEX_TEXTURE(inst->sampler),
2304	      inst->sampler,
2305	      WRITEMASK_XYZW,
2306	      msg_type,
2307	      rlen,
2308	      inst->mlen,
2309	      0,
2310	      1,
2311	      simd_mode);
2312}
2313
2314
2315/* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
2316 * looking like:
2317 *
2318 * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
2319 *
2320 * and we're trying to produce:
2321 *
2322 *           DDX                     DDY
2323 * dst: (ss0.tr - ss0.tl)     (ss0.tl - ss0.bl)
2324 *      (ss0.tr - ss0.tl)     (ss0.tr - ss0.br)
2325 *      (ss0.br - ss0.bl)     (ss0.tl - ss0.bl)
2326 *      (ss0.br - ss0.bl)     (ss0.tr - ss0.br)
2327 *      (ss1.tr - ss1.tl)     (ss1.tl - ss1.bl)
2328 *      (ss1.tr - ss1.tl)     (ss1.tr - ss1.br)
2329 *      (ss1.br - ss1.bl)     (ss1.tl - ss1.bl)
2330 *      (ss1.br - ss1.bl)     (ss1.tr - ss1.br)
2331 *
2332 * and add another set of two more subspans if in 16-pixel dispatch mode.
2333 *
2334 * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
2335 * for each pair, and vertstride = 2 jumps us 2 elements after processing a
2336 * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
2337 * between each other.  We could probably do it like ddx and swizzle into
2338 * the right order later, but we bail for now and just produce
2339 * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
2340 */
2341void
2342fs_visitor::generate_ddx(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
2343{
2344   struct brw_reg src0 = brw_reg(src.file, src.nr, 1,
2345				 BRW_REGISTER_TYPE_F,
2346				 BRW_VERTICAL_STRIDE_2,
2347				 BRW_WIDTH_2,
2348				 BRW_HORIZONTAL_STRIDE_0,
2349				 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
2350   struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
2351				 BRW_REGISTER_TYPE_F,
2352				 BRW_VERTICAL_STRIDE_2,
2353				 BRW_WIDTH_2,
2354				 BRW_HORIZONTAL_STRIDE_0,
2355				 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
2356   brw_ADD(p, dst, src0, negate(src1));
2357}
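
/* Reading the regions above (illustrative): src0's <2;2,0> region with
 * subnr 1 yields {tr, tr, br, br, ...} and src1's with subnr 0 yields
 * {tl, tl, bl, bl, ...}, so the single ADD produces exactly the DDX
 * column of the table above.
 */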
2358
2359void
2360fs_visitor::generate_ddy(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
2361{
2362   struct brw_reg src0 = brw_reg(src.file, src.nr, 0,
2363				 BRW_REGISTER_TYPE_F,
2364				 BRW_VERTICAL_STRIDE_4,
2365				 BRW_WIDTH_4,
2366				 BRW_HORIZONTAL_STRIDE_0,
2367				 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
2368   struct brw_reg src1 = brw_reg(src.file, src.nr, 2,
2369				 BRW_REGISTER_TYPE_F,
2370				 BRW_VERTICAL_STRIDE_4,
2371				 BRW_WIDTH_4,
2372				 BRW_HORIZONTAL_STRIDE_0,
2373				 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
2374   brw_ADD(p, dst, src0, negate(src1));
2375}
2376
2377void
2378fs_visitor::generate_discard_not(fs_inst *inst, struct brw_reg mask)
2379{
2380   if (intel->gen >= 6) {
2381      /* Gen6 no longer has the mask reg for us to just read the
2382       * active channels from.  However, cmp updates just the channels
2383       * of the flag reg that are enabled, so we can get at the
2384       * channel enables that way.  In this step, make a reg of ones
2385       * we'll compare to.
2386       */
2387      brw_MOV(p, mask, brw_imm_ud(1));
2388   } else {
2389      brw_push_insn_state(p);
2390      brw_set_mask_control(p, BRW_MASK_DISABLE);
2391      brw_NOT(p, mask, brw_mask_reg(1)); /* IMASK */
2392      brw_pop_insn_state(p);
2393   }
2394}
2395
2396void
2397fs_visitor::generate_discard_and(fs_inst *inst, struct brw_reg mask)
2398{
2399   if (intel->gen >= 6) {
2400      struct brw_reg f0 = brw_flag_reg();
2401      struct brw_reg g1 = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
2402
2403      brw_push_insn_state(p);
2404      brw_set_mask_control(p, BRW_MASK_DISABLE);
2405      brw_MOV(p, f0, brw_imm_uw(0xffff)); /* inactive channels undiscarded */
2406      brw_pop_insn_state(p);
2407
2408      brw_CMP(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UD),
2409	      BRW_CONDITIONAL_Z, mask, brw_imm_ud(0)); /* active channels fail test */
2410      /* Undo CMP's whacking of predication. */
2411      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2412
2413      brw_push_insn_state(p);
2414      brw_set_mask_control(p, BRW_MASK_DISABLE);
2415      brw_AND(p, g1, f0, g1);
2416      brw_pop_insn_state(p);
2417   } else {
2418      struct brw_reg g0 = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
2419
2420      mask = brw_uw1_reg(mask.file, mask.nr, 0);
2421
2422      brw_push_insn_state(p);
2423      brw_set_mask_control(p, BRW_MASK_DISABLE);
2424      brw_AND(p, g0, mask, g0);
2425      brw_pop_insn_state(p);
2426   }
2427}
2428
2429void
2430fs_visitor::generate_spill(fs_inst *inst, struct brw_reg src)
2431{
2432   assert(inst->mlen != 0);
2433
2434   brw_MOV(p,
2435	   retype(brw_message_reg(inst->base_mrf + 1), BRW_REGISTER_TYPE_UD),
2436	   retype(src, BRW_REGISTER_TYPE_UD));
2437   brw_oword_block_write_scratch(p, brw_message_reg(inst->base_mrf), 1,
2438				 inst->offset);
2439}
2440
2441void
2442fs_visitor::generate_unspill(fs_inst *inst, struct brw_reg dst)
2443{
2444   assert(inst->mlen != 0);
2445
2446   /* Clear any post destination dependencies that would be ignored by
2447    * the block read.  See the B-Spec for the pre-gen5 send instruction.
2448    *
2449    * This could use a better solution, since texture sampling and
2450    * math reads could potentially run into it as well -- anywhere
2451    * that we have a SEND with a destination that is a register that
2452    * was written but not read within the last N instructions (what's
2453    * N?  unsure).  This is rare because of dead code elimination, but
2454    * not impossible.
2455    */
2456   if (intel->gen == 4 && !intel->is_g4x)
2457      brw_MOV(p, brw_null_reg(), dst);
2458
2459   brw_oword_block_read_scratch(p, dst, brw_message_reg(inst->base_mrf), 1,
2460				inst->offset);
2461
2462   if (intel->gen == 4 && !intel->is_g4x) {
2463      /* gen4 errata: destination from a send can't be used as a
2464       * destination until it's been read.  Just read it so we don't
2465       * have to worry.
2466       */
2467      brw_MOV(p, brw_null_reg(), dst);
2468   }
2469}
2470
2471
2472void
2473fs_visitor::generate_pull_constant_load(fs_inst *inst, struct brw_reg dst)
2474{
2475   assert(inst->mlen != 0);
2476
2477   /* Clear any post destination dependencies that would be ignored by
2478    * the block read.  See the B-Spec for pre-gen5 send instruction.
2479    *
2480    * This could use a better solution, since texture sampling and
2481    * math reads could potentially run into it as well -- anywhere
2482    * that we have a SEND with a destination that is a register that
2483    * was written but not read within the last N instructions (what's
2484    * N?  unsure).  This is rare because of dead code elimination, but
2485    * not impossible.
2486    */
2487   if (intel->gen == 4 && !intel->is_g4x)
2488      brw_MOV(p, brw_null_reg(), dst);
2489
2490   brw_oword_block_read(p, dst, brw_message_reg(inst->base_mrf),
2491			inst->offset, SURF_INDEX_FRAG_CONST_BUFFER);
2492
2493   if (intel->gen == 4 && !intel->is_g4x) {
2494      /* gen4 errata: destination from a send can't be used as a
2495       * destination until it's been read.  Just read it so we don't
2496       * have to worry.
2497       */
2498      brw_MOV(p, brw_null_reg(), dst);
2499   }
2500}
2501
2502void
2503fs_visitor::assign_curb_setup()
2504{
2505   c->prog_data.first_curbe_grf = c->nr_payload_regs;
2506   c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
2507
2508   /* Map the offsets in the UNIFORM file to fixed HW regs. */
2509   foreach_iter(exec_list_iterator, iter, this->instructions) {
2510      fs_inst *inst = (fs_inst *)iter.get();
2511
2512      for (unsigned int i = 0; i < 3; i++) {
2513	 if (inst->src[i].file == UNIFORM) {
2514	    int constant_nr = inst->src[i].hw_reg + inst->src[i].reg_offset;
2515	    struct brw_reg brw_reg = brw_vec1_grf(c->prog_data.first_curbe_grf +
2516						  constant_nr / 8,
2517						  constant_nr % 8);
2518
2519	    inst->src[i].file = FIXED_HW_REG;
2520	    inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
2521	 }
2522      }
2523   }
2524}
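
/* E.g. (illustrative values): with first_curbe_grf == 2, UNIFORM
 * element 11 maps to brw_vec1_grf(2 + 11 / 8, 11 % 8), i.e. the scalar
 * float at g3.3 -- push constants are packed eight floats per GRF.
 */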
2525
2526void
2527fs_visitor::calculate_urb_setup()
2528{
2529   for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
2530      urb_setup[i] = -1;
2531   }
2532
2533   int urb_next = 0;
2534   /* Figure out where each of the incoming setup attributes lands. */
2535   if (intel->gen >= 6) {
2536      for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
2537	 if (brw->fragment_program->Base.InputsRead & BITFIELD64_BIT(i)) {
2538	    urb_setup[i] = urb_next++;
2539	 }
2540      }
2541   } else {
2542      /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
2543      for (unsigned int i = 0; i < VERT_RESULT_MAX; i++) {
2544	 if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
2545	    int fp_index;
2546
2547	    if (i >= VERT_RESULT_VAR0)
2548	       fp_index = i - (VERT_RESULT_VAR0 - FRAG_ATTRIB_VAR0);
2549	    else if (i <= VERT_RESULT_TEX7)
2550	       fp_index = i;
2551	    else
2552	       fp_index = -1;
2553
2554	    if (fp_index >= 0)
2555	       urb_setup[fp_index] = urb_next++;
2556	 }
2557      }
2558   }
2559
2560   /* Each attribute is 4 setup channels, each of which is half a reg. */
2561   c->prog_data.urb_read_length = urb_next * 2;
2562}
2563
2564void
2565fs_visitor::assign_urb_setup()
2566{
2567   int urb_start = c->prog_data.first_curbe_grf + c->prog_data.curb_read_length;
2568
2569   /* Offset all the urb_setup[] index by the actual position of the
2570    * setup regs, now that the location of the constants has been chosen.
2571    */
2572   foreach_iter(exec_list_iterator, iter, this->instructions) {
2573      fs_inst *inst = (fs_inst *)iter.get();
2574
2575      if (inst->opcode == FS_OPCODE_LINTERP) {
2576	 assert(inst->src[2].file == FIXED_HW_REG);
2577	 inst->src[2].fixed_hw_reg.nr += urb_start;
2578      }
2579
2580      if (inst->opcode == FS_OPCODE_CINTERP) {
2581	 assert(inst->src[0].file == FIXED_HW_REG);
2582	 inst->src[0].fixed_hw_reg.nr += urb_start;
2583      }
2584   }
2585
2586   this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
2587}
2588
2589/**
2590 * Split large virtual GRFs into separate components if we can.
2591 *
2592 * This mostly duplicates what brw_fs_vector_splitting does,
2593 * but that's really conservative because it's afraid of doing
2594 * splitting that doesn't result in real progress after the rest of
2595 * the optimization phases, which would cause infinite looping in
2596 * optimization.  We can do it once here, safely.  This also has the
2597 * opportunity to split interpolated values, or maybe even uniforms,
2598 * which we don't have at the IR level.
2599 *
2600 * We want to split, because virtual GRFs are what we register
2601 * allocate and spill (due to contiguousness requirements for some
2602 * instructions), and they're what we naturally generate in the
2603 * codegen process, but most virtual GRFs don't actually need to be
2604 * contiguous sets of GRFs.  If we split, we'll end up with reduced
2605 * live intervals and better dead code elimination and coalescing.
2606 */
2607void
2608fs_visitor::split_virtual_grfs()
2609{
2610   int num_vars = this->virtual_grf_next;
2611   bool split_grf[num_vars];
2612   int new_virtual_grf[num_vars];
2613
2614   /* Try to split anything larger than one register. */
2615   for (int i = 0; i < num_vars; i++) {
2616      if (this->virtual_grf_sizes[i] != 1)
2617	 split_grf[i] = true;
2618      else
2619	 split_grf[i] = false;
2620   }
2621
2622   if (brw->has_pln) {
2623      /* PLN opcodes rely on the delta_xy being contiguous. */
2624      split_grf[this->delta_x.reg] = false;
2625   }
2626
2627   foreach_iter(exec_list_iterator, iter, this->instructions) {
2628      fs_inst *inst = (fs_inst *)iter.get();
2629
2630      /* Texturing produces 4 contiguous registers, so no splitting. */
2631      if ((inst->opcode == FS_OPCODE_TEX ||
2632	   inst->opcode == FS_OPCODE_TXB ||
2633	   inst->opcode == FS_OPCODE_TXL) &&
2634	  inst->dst.file == GRF) {
2635	 split_grf[inst->dst.reg] = false;
2636      }
2637   }
2638
2639   /* Allocate new space for split regs.  Note that the virtual
2640    * numbers will be contiguous.
2641    */
2642   for (int i = 0; i < num_vars; i++) {
2643      if (split_grf[i]) {
2644	 new_virtual_grf[i] = virtual_grf_alloc(1);
2645	 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
2646	    int reg = virtual_grf_alloc(1);
2647	    assert(reg == new_virtual_grf[i] + j - 1);
2648	    (void) reg;
2649	 }
2650	 this->virtual_grf_sizes[i] = 1;
2651      }
2652   }
2653
2654   foreach_iter(exec_list_iterator, iter, this->instructions) {
2655      fs_inst *inst = (fs_inst *)iter.get();
2656
2657      if (inst->dst.file == GRF &&
2658	  split_grf[inst->dst.reg] &&
2659	  inst->dst.reg_offset != 0) {
2660	 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
2661			  inst->dst.reg_offset - 1);
2662	 inst->dst.reg_offset = 0;
2663      }
2664      for (int i = 0; i < 3; i++) {
2665	 if (inst->src[i].file == GRF &&
2666	     split_grf[inst->src[i].reg] &&
2667	     inst->src[i].reg_offset != 0) {
2668	    inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
2669				inst->src[i].reg_offset - 1);
2670	    inst->src[i].reg_offset = 0;
2671	 }
2672      }
2673   }
2674   this->live_intervals_valid = false;
2675}
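
/* E.g. (illustrative): a size-4 virtual GRF (say a vec4 temporary)
 * keeps its original register for reg_offset 0 and gets three fresh
 * size-1 registers for offsets 1..3, so each component is coalesced,
 * allocated, and spilled independently from here on.
 */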
2676
2677/**
2678 * Choose accesses from the UNIFORM file to demote to using the pull
2679 * constant buffer.
2680 *
2681 * We allow a fragment shader to have more than the GL-specified minimum
2682 * maximum number of fragment shader uniform components (64).  If there
2683 * are too many of these, they'd fill up all of the register space.
2684 * So, this will push some of them out to the pull constant buffer and
2685 * update the program to load them.
2686 */
2687void
2688fs_visitor::setup_pull_constants()
2689{
2690   /* Only allow 16 registers (128 uniform components) as push constants. */
2691   unsigned int max_uniform_components = 16 * 8;
2692   if (c->prog_data.nr_params <= max_uniform_components)
2693      return;
2694
2695   /* Just demote the end of the list.  We could probably do better
2696    * here, demoting things that are rarely used in the program first.
2697    */
2698   int pull_uniform_base = max_uniform_components;
2699   int pull_uniform_count = c->prog_data.nr_params - pull_uniform_base;
2700
2701   foreach_iter(exec_list_iterator, iter, this->instructions) {
2702      fs_inst *inst = (fs_inst *)iter.get();
2703
2704      for (int i = 0; i < 3; i++) {
2705	 if (inst->src[i].file != UNIFORM)
2706	    continue;
2707
2708	 int uniform_nr = inst->src[i].hw_reg + inst->src[i].reg_offset;
2709	 if (uniform_nr < pull_uniform_base)
2710	    continue;
2711
2712	 fs_reg dst = fs_reg(this, glsl_type::float_type);
2713	 fs_inst *pull = new(mem_ctx) fs_inst(FS_OPCODE_PULL_CONSTANT_LOAD,
2714					      dst);
2715	 pull->offset = ((uniform_nr - pull_uniform_base) * 4) & ~15;
2716	 pull->ir = inst->ir;
2717	 pull->annotation = inst->annotation;
2718	 pull->base_mrf = 14;
2719	 pull->mlen = 1;
2720
2721	 inst->insert_before(pull);
2722
2723	 inst->src[i].file = GRF;
2724	 inst->src[i].reg = dst.reg;
2725	 inst->src[i].reg_offset = 0;
2726	 inst->src[i].smear = (uniform_nr - pull_uniform_base) & 3;
2727      }
2728   }
2729
2730   for (int i = 0; i < pull_uniform_count; i++) {
2731      c->prog_data.pull_param[i] = c->prog_data.param[pull_uniform_base + i];
2732      c->prog_data.pull_param_convert[i] =
2733	 c->prog_data.param_convert[pull_uniform_base + i];
2734   }
2735   c->prog_data.nr_params -= pull_uniform_count;
2736   c->prog_data.nr_pull_params = pull_uniform_count;
2737}
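
/* E.g. (illustrative): with the 16-register (128-component) push limit
 * above, uniform 130 becomes a pull load at offset
 * ((130 - 128) * 4) & ~15 == 0 with smear == 2: the oword read fetches
 * the aligned 16-byte block and the smear selects component 2 of it.
 */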
2738
2739void
2740fs_visitor::calculate_live_intervals()
2741{
2742   int num_vars = this->virtual_grf_next;
2743   int *def = talloc_array(mem_ctx, int, num_vars);
2744   int *use = talloc_array(mem_ctx, int, num_vars);
2745   int loop_depth = 0;
2746   int loop_start = 0;
2747   int bb_header_ip = 0;
2748
2749   if (this->live_intervals_valid)
2750      return;
2751
2752   for (int i = 0; i < num_vars; i++) {
2753      def[i] = 1 << 30;
2754      use[i] = -1;
2755   }
2756
2757   int ip = 0;
2758   foreach_iter(exec_list_iterator, iter, this->instructions) {
2759      fs_inst *inst = (fs_inst *)iter.get();
2760
2761      if (inst->opcode == BRW_OPCODE_DO) {
2762	 if (loop_depth++ == 0)
2763	    loop_start = ip;
2764      } else if (inst->opcode == BRW_OPCODE_WHILE) {
2765	 loop_depth--;
2766
2767	 if (loop_depth == 0) {
2768	    /* Patch up the uses of vars marked as live across
2769	     * the whole loop.
2770	     */
2771	    for (int i = 0; i < num_vars; i++) {
2772	       if (use[i] == loop_start) {
2773		  use[i] = ip;
2774	       }
2775	    }
2776	 }
2777      } else {
2778	 for (unsigned int i = 0; i < 3; i++) {
2779	    if (inst->src[i].file == GRF && inst->src[i].reg != 0) {
2780	       int reg = inst->src[i].reg;
2781
2782	       if (!loop_depth || (this->virtual_grf_sizes[reg] == 1 &&
2783				   def[reg] >= bb_header_ip)) {
2784		  use[reg] = ip;
2785	       } else {
2786		  def[reg] = MIN2(loop_start, def[reg]);
2787		  use[reg] = loop_start;
2788
2789		  /* Nobody else is going to push our start later into
2790		   * the loop now, because def[reg] now points before
2791		   * the bb header.
2792		   */
2793	       }
2794	    }
2795	 }
2796	 if (inst->dst.file == GRF && inst->dst.reg != 0) {
2797	    int reg = inst->dst.reg;
2798
2799	    if (!loop_depth || (this->virtual_grf_sizes[reg] == 1 &&
2800				!inst->predicated)) {
2801	       def[reg] = MIN2(def[reg], ip);
2802	    } else {
2803	       def[reg] = MIN2(def[reg], loop_start);
2804	    }
2805	 }
2806      }
2807
2808      ip++;
2809
2810      /* Set the basic block header IP.  This is used for determining
2811       * if a complete def of a single-register virtual GRF in a loop
2812       * dominates a use in the same basic block.  It's a quick way to
2813       * reduce the live interval range of most registers used in a
2814       * loop.
2815       */
2816      if (inst->opcode == BRW_OPCODE_IF ||
2817	  inst->opcode == BRW_OPCODE_ELSE ||
2818	  inst->opcode == BRW_OPCODE_ENDIF ||
2819	  inst->opcode == BRW_OPCODE_DO ||
2820	  inst->opcode == BRW_OPCODE_WHILE ||
2821	  inst->opcode == BRW_OPCODE_BREAK ||
2822	  inst->opcode == BRW_OPCODE_CONTINUE) {
2823	 bb_header_ip = ip;
2824      }
2825   }
2826
2827   talloc_free(this->virtual_grf_def);
2828   talloc_free(this->virtual_grf_use);
2829   this->virtual_grf_def = def;
2830   this->virtual_grf_use = use;
2831
2832   this->live_intervals_valid = true;
2833}
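
/* E.g. (illustrative): a register defined before a loop and read inside
 * it has its use[] patched out to the WHILE's ip, keeping it live across
 * every iteration; only a single-register GRF whose def lands in the
 * same basic block as the use (def >= bb_header_ip) escapes that
 * widening.
 */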
2834
2835/**
2836 * Attempts to move immediate constants into the immediate
2837 * constant slot of following instructions.
2838 *
2839 * Immediate constants are a bit tricky -- they have to be in the last
2840 * operand slot, and you can't do abs/negate on them.
2841 */
2842
2843bool
2844fs_visitor::propagate_constants()
2845{
2846   bool progress = false;
2847
2848   calculate_live_intervals();
2849
2850   foreach_iter(exec_list_iterator, iter, this->instructions) {
2851      fs_inst *inst = (fs_inst *)iter.get();
2852
2853      if (inst->opcode != BRW_OPCODE_MOV ||
2854	  inst->predicated ||
2855	  inst->dst.file != GRF || inst->src[0].file != IMM ||
2856	  inst->dst.type != inst->src[0].type)
2857	 continue;
2858
2859      /* Don't bother with cases where we should have had the
2860       * operation on the constant folded in GLSL already.
2861       */
2862      if (inst->saturate)
2863	 continue;
2864
2865      /* Found a move of a constant to a GRF.  Find anything else using the GRF
2866       * before it's written, and replace it with the constant if we can.
2867       */
2868      exec_list_iterator scan_iter = iter;
2869      scan_iter.next();
2870      for (; scan_iter.has_next(); scan_iter.next()) {
2871	 fs_inst *scan_inst = (fs_inst *)scan_iter.get();
2872
2873	 if (scan_inst->opcode == BRW_OPCODE_DO ||
2874	     scan_inst->opcode == BRW_OPCODE_WHILE ||
2875	     scan_inst->opcode == BRW_OPCODE_ELSE ||
2876	     scan_inst->opcode == BRW_OPCODE_ENDIF) {
2877	    break;
2878	 }
2879
2880	 for (int i = 2; i >= 0; i--) {
2881	    if (scan_inst->src[i].file != GRF ||
2882		scan_inst->src[i].reg != inst->dst.reg ||
2883		scan_inst->src[i].reg_offset != inst->dst.reg_offset)
2884	       continue;
2885
2886	    /* Don't bother with cases where we should have had the
2887	     * operation on the constant folded in GLSL already.
2888	     */
2889	    if (scan_inst->src[i].negate || scan_inst->src[i].abs)
2890	       continue;
2891
2892	    switch (scan_inst->opcode) {
2893	    case BRW_OPCODE_MOV:
2894	       scan_inst->src[i] = inst->src[0];
2895	       progress = true;
2896	       break;
2897
2898	    case BRW_OPCODE_MUL:
2899	    case BRW_OPCODE_ADD:
2900	       if (i == 1) {
2901		  scan_inst->src[i] = inst->src[0];
2902		  progress = true;
2903	       } else if (i == 0 && scan_inst->src[1].file != IMM) {
2904		  /* Fit this constant in by commuting the operands */
2905		  scan_inst->src[0] = scan_inst->src[1];
2906		  scan_inst->src[1] = inst->src[0];
2907		  progress = true;
2908	       }
2909	       break;
2910	    case BRW_OPCODE_CMP:
2911	    case BRW_OPCODE_SEL:
2912	       if (i == 1) {
2913		  scan_inst->src[i] = inst->src[0];
2914		  progress = true;
2915	       }
2916	    }
2917	 }
2918
2919	 if (scan_inst->dst.file == GRF &&
2920	     scan_inst->dst.reg == inst->dst.reg &&
2921	     (scan_inst->dst.reg_offset == inst->dst.reg_offset ||
2922	      scan_inst->opcode == FS_OPCODE_TEX)) {
2923	    break;
2924	 }
2925      }
2926   }
2927
2928   if (progress)
2929       this->live_intervals_valid = false;
2930
2931   return progress;
2932}
2933/**
2934 * Must be called after calculate_live_intervals() to remove unused
2935 * writes to registers -- register allocation will fail otherwise
2936 * because something def'd but not used won't be considered to
2937 * interfere with other regs.
2938 */
2939bool
2940fs_visitor::dead_code_eliminate()
2941{
2942   bool progress = false;
2943   int pc = 0;
2944
2945   calculate_live_intervals();
2946
2947   foreach_iter(exec_list_iterator, iter, this->instructions) {
2948      fs_inst *inst = (fs_inst *)iter.get();
2949
2950      if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
2951	 inst->remove();
2952	 progress = true;
2953      }
2954
2955      pc++;
2956   }
2957
2958   if (progress)
2959      live_intervals_valid = false;
2960
2961   return progress;
2962}
2963
2964bool
2965fs_visitor::register_coalesce()
2966{
2967   bool progress = false;
2968   int if_depth = 0;
2969   int loop_depth = 0;
2970
2971   foreach_iter(exec_list_iterator, iter, this->instructions) {
2972      fs_inst *inst = (fs_inst *)iter.get();
2973
2974      /* Make sure that we dominate the instructions we're going to
2975       * scan for interfering with our coalescing, or we won't have
2976       * scanned enough to see if anything interferes with our
2977       * coalescing.  We don't dominate the following instructions if
2978       * we're in a loop or an if block.
2979       */
2980      switch (inst->opcode) {
2981      case BRW_OPCODE_DO:
2982	 loop_depth++;
2983	 break;
2984      case BRW_OPCODE_WHILE:
2985	 loop_depth--;
2986	 break;
2987      case BRW_OPCODE_IF:
2988	 if_depth++;
2989	 break;
2990      case BRW_OPCODE_ENDIF:
2991	 if_depth--;
2992	 break;
2993      }
2994      if (loop_depth || if_depth)
2995	 continue;
2996
2997      if (inst->opcode != BRW_OPCODE_MOV ||
2998	  inst->predicated ||
2999	  inst->saturate ||
3000	  inst->dst.file != GRF || inst->src[0].file != GRF ||
3001	  inst->dst.type != inst->src[0].type)
3002	 continue;
3003
3004      /* Found a move of a GRF to a GRF.  Let's see if we can coalesce
3005       * them: check for no writes to either one until the exit of the
3006       * program.
3007       */
3008      bool interfered = false;
3009      exec_list_iterator scan_iter = iter;
3010      scan_iter.next();
3011      for (; scan_iter.has_next(); scan_iter.next()) {
3012	 fs_inst *scan_inst = (fs_inst *)scan_iter.get();
3013
3014	 if (scan_inst->dst.file == GRF) {
3015	    if (scan_inst->dst.reg == inst->dst.reg &&
3016		(scan_inst->dst.reg_offset == inst->dst.reg_offset ||
3017		 scan_inst->opcode == FS_OPCODE_TEX)) {
3018	       interfered = true;
3019	       break;
3020	    }
3021	    if (scan_inst->dst.reg == inst->src[0].reg &&
3022		(scan_inst->dst.reg_offset == inst->src[0].reg_offset ||
3023		 scan_inst->opcode == FS_OPCODE_TEX)) {
3024	       interfered = true;
3025	       break;
3026	    }
3027	 }
3028      }
3029      if (interfered) {
3030	 continue;
3031      }
3032
3033      /* Rewrite the later usage to point at the source of the move to
3034       * be removed.
3035       */
3036      for (exec_list_iterator scan_iter = iter; scan_iter.has_next();
3037	   scan_iter.next()) {
3038	 fs_inst *scan_inst = (fs_inst *)scan_iter.get();
3039
3040	 for (int i = 0; i < 3; i++) {
3041	    if (scan_inst->src[i].file == GRF &&
3042		scan_inst->src[i].reg == inst->dst.reg &&
3043		scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
3044	       scan_inst->src[i].reg = inst->src[0].reg;
3045	       scan_inst->src[i].reg_offset = inst->src[0].reg_offset;
3046	       scan_inst->src[i].abs |= inst->src[0].abs;
3047	       scan_inst->src[i].negate ^= inst->src[0].negate;
3048	       scan_inst->src[i].smear = inst->src[0].smear;
3049	    }
3050	 }
3051      }
3052
3053      inst->remove();
3054      progress = true;
3055   }
3056
3057   if (progress)
3058      live_intervals_valid = false;
3059
3060   return progress;
3061}
3062
3063
3064bool
3065fs_visitor::compute_to_mrf()
3066{
3067   bool progress = false;
3068   int next_ip = 0;
3069
3070   calculate_live_intervals();
3071
3072   foreach_iter(exec_list_iterator, iter, this->instructions) {
3073      fs_inst *inst = (fs_inst *)iter.get();
3074
3075      int ip = next_ip;
3076      next_ip++;
3077
3078      if (inst->opcode != BRW_OPCODE_MOV ||
3079	  inst->predicated ||
3080	  inst->dst.file != MRF || inst->src[0].file != GRF ||
3081	  inst->dst.type != inst->src[0].type ||
3082	  inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
3083	 continue;
3084
3085      /* Can't compute-to-MRF this GRF if someone else was going to
3086       * read it later.
3087       */
3088      if (this->virtual_grf_use[inst->src[0].reg] > ip)
3089	 continue;
3090
3091      /* Found a move of a GRF to an MRF.  Let's see if we can rewrite
3092       * the instruction that produced this GRF to write into the MRF instead.
3093       */
3094      fs_inst *scan_inst;
3095      for (scan_inst = (fs_inst *)inst->prev;
3096	   scan_inst->prev != NULL;
3097	   scan_inst = (fs_inst *)scan_inst->prev) {
3098	 if (scan_inst->dst.file == GRF &&
3099	     scan_inst->dst.reg == inst->src[0].reg) {
3100	    /* Found the last thing to write our reg we want to turn
3101	     * into a compute-to-MRF.
3102	     */
3103
3104	    if (scan_inst->opcode == FS_OPCODE_TEX) {
3105	       /* texturing writes several contiguous regs, so we can't
3106		* compute-to-mrf that.
3107		*/
3108	       break;
3109	    }
3110
3111	    /* If it's predicated, it (probably) didn't populate all
3112	     * the channels.
3113	     */
3114	    if (scan_inst->predicated)
3115	       break;
3116
3117	    /* SEND instructions can't have MRF as a destination. */
3118	    if (scan_inst->mlen)
3119	       break;
3120
3121	    if (intel->gen >= 6) {
3122	       /* gen6 math instructions must have the destination be
3123		* GRF, so no compute-to-MRF for them.
3124		*/
3125	       if (scan_inst->opcode == FS_OPCODE_RCP ||
3126		   scan_inst->opcode == FS_OPCODE_RSQ ||
3127		   scan_inst->opcode == FS_OPCODE_SQRT ||
3128		   scan_inst->opcode == FS_OPCODE_EXP2 ||
3129		   scan_inst->opcode == FS_OPCODE_LOG2 ||
3130		   scan_inst->opcode == FS_OPCODE_SIN ||
3131		   scan_inst->opcode == FS_OPCODE_COS ||
3132		   scan_inst->opcode == FS_OPCODE_POW) {
3133		  break;
3134	       }
3135	    }
3136
3137	    if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
3138	       /* Found the creator of our MRF's source value. */
3139	       scan_inst->dst.file = MRF;
3140	       scan_inst->dst.hw_reg = inst->dst.hw_reg;
3141	       scan_inst->saturate |= inst->saturate;
3142	       inst->remove();
3143	       progress = true;
3144	    }
3145	    break;
3146	 }
3147
3148	 /* We don't handle flow control here.  Most computation of
3149	  * values that end up in MRFs happens shortly before the MRF
3150	  * write anyway.
3151	  */
3152	 if (scan_inst->opcode == BRW_OPCODE_DO ||
3153	     scan_inst->opcode == BRW_OPCODE_WHILE ||
3154	     scan_inst->opcode == BRW_OPCODE_ENDIF) {
3155	    break;
3156	 }
3157
3158	 /* You can't read from an MRF, so if someone else reads our
3159	  * MRF's source GRF that we wanted to rewrite, that stops us.
3160	  */
3161	 bool interfered = false;
3162	 for (int i = 0; i < 3; i++) {
3163	    if (scan_inst->src[i].file == GRF &&
3164		scan_inst->src[i].reg == inst->src[0].reg &&
3165		scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
3166	       interfered = true;
3167	    }
3168	 }
3169	 if (interfered)
3170	    break;
3171
3172	 if (scan_inst->dst.file == MRF &&
3173	     scan_inst->dst.hw_reg == inst->dst.hw_reg) {
3174	    /* Somebody else wrote our MRF here, so we can't
3175	     * compute-to-MRF before that.
3176	     */
3177	    break;
3178	 }
3179
3180	 if (scan_inst->mlen > 0) {
3181	    /* Found a SEND instruction, which means that there are
3182	     * live values in MRFs from base_mrf to base_mrf +
3183	     * scan_inst->mlen - 1.  Don't go pushing our MRF write up
3184	     * above it.
3185	     */
3186	    if (inst->dst.hw_reg >= scan_inst->base_mrf &&
3187		inst->dst.hw_reg < scan_inst->base_mrf + scan_inst->mlen) {
3188	       break;
3189	    }
3190	 }
3191      }
3192   }
3193
3194   return progress;
3195}
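
/* E.g. (illustrative): an "ADD vgrf7, x, y" shortly followed by
 * "MOV m4, vgrf7", with no later reads of vgrf7 and no intervening
 * hazards, is rewritten so the ADD targets m4 directly and the MOV is
 * removed.
 */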
3196
3197/**
3198 * Walks through basic blocks, looking for repeated MRF writes and
3199 * removing the later ones.
3200 */
3201bool
3202fs_visitor::remove_duplicate_mrf_writes()
3203{
3204   fs_inst *last_mrf_move[16];
3205   bool progress = false;
3206
3207   memset(last_mrf_move, 0, sizeof(last_mrf_move));
3208
3209   foreach_iter(exec_list_iterator, iter, this->instructions) {
3210      fs_inst *inst = (fs_inst *)iter.get();
3211
3212      switch (inst->opcode) {
3213      case BRW_OPCODE_DO:
3214      case BRW_OPCODE_WHILE:
3215      case BRW_OPCODE_IF:
3216      case BRW_OPCODE_ELSE:
3217      case BRW_OPCODE_ENDIF:
3218	 memset(last_mrf_move, 0, sizeof(last_mrf_move));
3219	 continue;
3220      default:
3221	 break;
3222      }
3223
3224      if (inst->opcode == BRW_OPCODE_MOV &&
3225	  inst->dst.file == MRF) {
3226	 fs_inst *prev_inst = last_mrf_move[inst->dst.hw_reg];
3227	 if (prev_inst && inst->equals(prev_inst)) {
3228	    inst->remove();
3229	    progress = true;
3230	    continue;
3231	 }
3232      }
3233
3234      /* Clear out the last-write records for MRFs that were overwritten. */
3235      if (inst->dst.file == MRF) {
3236	 last_mrf_move[inst->dst.hw_reg] = NULL;
3237      }
3238
3239      if (inst->mlen > 0) {
3240	 /* Found a SEND instruction, which will include two or fewer
3241	  * implied MRF writes.  We could do better here.
3242	  */
3243	 for (int i = 0; i < implied_mrf_writes(inst); i++) {
3244	    last_mrf_move[inst->base_mrf + i] = NULL;
3245	 }
3246      }
3247
3248      /* Clear out any MRF move records whose sources got overwritten. */
3249      if (inst->dst.file == GRF) {
3250	 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
3251	    if (last_mrf_move[i] &&
3252		last_mrf_move[i]->src[0].reg == inst->dst.reg) {
3253	       last_mrf_move[i] = NULL;
3254	    }
3255	 }
3256      }
3257
3258      if (inst->opcode == BRW_OPCODE_MOV &&
3259	  inst->dst.file == MRF &&
3260	  inst->src[0].file == GRF &&
3261	  !inst->predicated) {
3262	 last_mrf_move[inst->dst.hw_reg] = inst;
3263      }
3264   }
3265
3266   return progress;
3267}
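
/* E.g. (illustrative): two identical "MOV m3, vgrf5" instructions with
 * no intervening control flow, send, or overwrite of m3 or vgrf5
 * collapse to one; any of those events resets the tracking instead.
 */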
3268
3269bool
3270fs_visitor::virtual_grf_interferes(int a, int b)
3271{
3272   int start = MAX2(this->virtual_grf_def[a], this->virtual_grf_def[b]);
3273   int end = MIN2(this->virtual_grf_use[a], this->virtual_grf_use[b]);
3274
3275   /* For dead code, just check if the def interferes with the other range. */
3276   if (this->virtual_grf_use[a] == -1) {
3277      return (this->virtual_grf_def[a] >= this->virtual_grf_def[b] &&
3278	      this->virtual_grf_def[a] < this->virtual_grf_use[b]);
3279   }
3280   if (this->virtual_grf_use[b] == -1) {
3281      return (this->virtual_grf_def[b] >= this->virtual_grf_def[a] &&
3282	      this->virtual_grf_def[b] < this->virtual_grf_use[a]);
3283   }
3284
3285   return start < end;
3286}
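
/* E.g. (illustrative): def/use intervals of (10, 20) and (20, 30) give
 * start == end == 20, so the two ranges don't interfere and the
 * allocator may put them in the same hardware register -- exactly the
 * case a coalesced MOV at ip 20 creates.
 */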
3287
3288static struct brw_reg brw_reg_from_fs_reg(fs_reg *reg)
3289{
3290   struct brw_reg brw_reg;
3291
3292   switch (reg->file) {
3293   case GRF:
3294   case ARF:
3295   case MRF:
3296      if (reg->smear == -1) {
3297	 brw_reg = brw_vec8_reg(reg->file,
3298				reg->hw_reg, 0);
3299      } else {
3300	 brw_reg = brw_vec1_reg(reg->file,
3301				reg->hw_reg, reg->smear);
3302      }
3303      brw_reg = retype(brw_reg, reg->type);
3304      break;
3305   case IMM:
3306      switch (reg->type) {
3307      case BRW_REGISTER_TYPE_F:
3308	 brw_reg = brw_imm_f(reg->imm.f);
3309	 break;
3310      case BRW_REGISTER_TYPE_D:
3311	 brw_reg = brw_imm_d(reg->imm.i);
3312	 break;
3313      case BRW_REGISTER_TYPE_UD:
3314	 brw_reg = brw_imm_ud(reg->imm.u);
3315	 break;
3316      default:
3317	 assert(!"not reached");
3318	 brw_reg = brw_null_reg();
3319	 break;
3320      }
3321      break;
3322   case FIXED_HW_REG:
3323      brw_reg = reg->fixed_hw_reg;
3324      break;
3325   case BAD_FILE:
3326      /* Probably unused. */
3327      brw_reg = brw_null_reg();
3328      break;
3329   case UNIFORM:
3330      assert(!"not reached");
3331      brw_reg = brw_null_reg();
3332      break;
3333   default:
3334      assert(!"not reached");
3335      brw_reg = brw_null_reg();
3336      break;
3337   }
3338   if (reg->abs)
3339      brw_reg = brw_abs(brw_reg);
3340   if (reg->negate)
3341      brw_reg = negate(brw_reg);
3342
3343   return brw_reg;
3344}
3345
3346void
3347fs_visitor::generate_code()
3348{
3349   int last_native_inst = 0;
3350   struct brw_instruction *if_stack[16], *loop_stack[16];
3351   int if_stack_depth = 0, loop_stack_depth = 0;
3352   int if_depth_in_loop[16];
3353   const char *last_annotation_string = NULL;
3354   ir_instruction *last_annotation_ir = NULL;
3355
3356   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3357      printf("Native code for fragment shader %d:\n",
3358	     ctx->Shader.CurrentFragmentProgram->Name);
3359   }
3360
3361   if_depth_in_loop[loop_stack_depth] = 0;
3362
3363   memset(if_stack, 0, sizeof(if_stack));
3364   foreach_iter(exec_list_iterator, iter, this->instructions) {
3365      fs_inst *inst = (fs_inst *)iter.get();
3366      struct brw_reg src[3], dst;
3367
3368      if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3369	 if (last_annotation_ir != inst->ir) {
3370	    last_annotation_ir = inst->ir;
3371	    if (last_annotation_ir) {
3372	       printf("   ");
3373	       last_annotation_ir->print();
3374	       printf("\n");
3375	    }
3376	 }
3377	 if (last_annotation_string != inst->annotation) {
3378	    last_annotation_string = inst->annotation;
3379	    if (last_annotation_string)
3380	       printf("   %s\n", last_annotation_string);
3381	 }
3382      }
3383
3384      for (unsigned int i = 0; i < 3; i++) {
3385	 src[i] = brw_reg_from_fs_reg(&inst->src[i]);
3386      }
3387      dst = brw_reg_from_fs_reg(&inst->dst);
3388
3389      brw_set_conditionalmod(p, inst->conditional_mod);
3390      brw_set_predicate_control(p, inst->predicated);
3391      brw_set_saturate(p, inst->saturate);
3392
3393      switch (inst->opcode) {
3394      case BRW_OPCODE_MOV:
3395	 brw_MOV(p, dst, src[0]);
3396	 break;
3397      case BRW_OPCODE_ADD:
3398	 brw_ADD(p, dst, src[0], src[1]);
3399	 break;
3400      case BRW_OPCODE_MUL:
3401	 brw_MUL(p, dst, src[0], src[1]);
3402	 break;
3403
3404      case BRW_OPCODE_FRC:
3405	 brw_FRC(p, dst, src[0]);
3406	 break;
3407      case BRW_OPCODE_RNDD:
3408	 brw_RNDD(p, dst, src[0]);
3409	 break;
3410      case BRW_OPCODE_RNDE:
3411	 brw_RNDE(p, dst, src[0]);
3412	 break;
3413      case BRW_OPCODE_RNDZ:
3414	 brw_RNDZ(p, dst, src[0]);
3415	 break;
3416
3417      case BRW_OPCODE_AND:
3418	 brw_AND(p, dst, src[0], src[1]);
3419	 break;
3420      case BRW_OPCODE_OR:
3421	 brw_OR(p, dst, src[0], src[1]);
3422	 break;
3423      case BRW_OPCODE_XOR:
3424	 brw_XOR(p, dst, src[0], src[1]);
3425	 break;
3426      case BRW_OPCODE_NOT:
3427	 brw_NOT(p, dst, src[0]);
3428	 break;
3429      case BRW_OPCODE_ASR:
3430	 brw_ASR(p, dst, src[0], src[1]);
3431	 break;
3432      case BRW_OPCODE_SHR:
3433	 brw_SHR(p, dst, src[0], src[1]);
3434	 break;
3435      case BRW_OPCODE_SHL:
3436	 brw_SHL(p, dst, src[0], src[1]);
3437	 break;
3438
3439      case BRW_OPCODE_CMP:
3440	 brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
3441	 break;
3442      case BRW_OPCODE_SEL:
3443	 brw_SEL(p, dst, src[0], src[1]);
3444	 break;
3445
3446      case BRW_OPCODE_IF:
3447	 assert(if_stack_depth < 16);
3448	 if (inst->src[0].file != BAD_FILE) {
3449	    assert(intel->gen >= 6);
3450	    if_stack[if_stack_depth] = brw_IF_gen6(p, inst->conditional_mod, src[0], src[1]);
3451	 } else {
3452	    if_stack[if_stack_depth] = brw_IF(p, BRW_EXECUTE_8);
3453	 }
3454	 if_depth_in_loop[loop_stack_depth]++;
3455	 if_stack_depth++;
3456	 break;
3457
3458      case BRW_OPCODE_ELSE:
3459	 if_stack[if_stack_depth - 1] =
3460	    brw_ELSE(p, if_stack[if_stack_depth - 1]);
3461	 break;
3462      case BRW_OPCODE_ENDIF:
3463	 if_stack_depth--;
3464	 brw_ENDIF(p, if_stack[if_stack_depth]);
3465	 if_depth_in_loop[loop_stack_depth]--;
3466	 break;
3467
3468      case BRW_OPCODE_DO:
3469	 loop_stack[loop_stack_depth++] = brw_DO(p, BRW_EXECUTE_8);
3470	 if_depth_in_loop[loop_stack_depth] = 0;
3471	 break;
3472
3473      case BRW_OPCODE_BREAK:
3474	 brw_BREAK(p, if_depth_in_loop[loop_stack_depth]);
3475	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
3476	 break;
3477      case BRW_OPCODE_CONTINUE:
3478	 /* FINISHME: We need to write the loop instruction support still. */
3479	 if (intel->gen >= 6)
3480	    brw_CONT_gen6(p, loop_stack[loop_stack_depth - 1]);
3481	 else
3482	    brw_CONT(p, if_depth_in_loop[loop_stack_depth]);
3483	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
3484	 break;
3485
3486      case BRW_OPCODE_WHILE: {
3487	 struct brw_instruction *inst0, *inst1;
3488	 GLuint br = 1;
3489
3490	 if (intel->gen >= 5)
3491	    br = 2;
3492
3493	 assert(loop_stack_depth > 0);
3494	 loop_stack_depth--;
3495	 inst0 = inst1 = brw_WHILE(p, loop_stack[loop_stack_depth]);
3496	 if (intel->gen < 6) {
3497	    /* patch all the BREAK/CONT instructions from last BGNLOOP */
3498	    while (inst0 > loop_stack[loop_stack_depth]) {
3499	       inst0--;
3500	       if (inst0->header.opcode == BRW_OPCODE_BREAK &&
3501		   inst0->bits3.if_else.jump_count == 0) {
3502		  inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
3503	       }
3504	       else if (inst0->header.opcode == BRW_OPCODE_CONTINUE &&
3505			inst0->bits3.if_else.jump_count == 0) {
3506		  inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
3507	       }
3508	    }
3509	 }
3510      }
3511	 break;
3512
3513      case FS_OPCODE_RCP:
3514      case FS_OPCODE_RSQ:
3515      case FS_OPCODE_SQRT:
3516      case FS_OPCODE_EXP2:
3517      case FS_OPCODE_LOG2:
3518      case FS_OPCODE_POW:
3519      case FS_OPCODE_SIN:
3520      case FS_OPCODE_COS:
3521	 generate_math(inst, dst, src);
3522	 break;
3523      case FS_OPCODE_CINTERP:
3524	 brw_MOV(p, dst, src[0]);
3525	 break;
3526      case FS_OPCODE_LINTERP:
3527	 generate_linterp(inst, dst, src);
3528	 break;
3529      case FS_OPCODE_TEX:
3530      case FS_OPCODE_TXB:
3531      case FS_OPCODE_TXL:
3532	 generate_tex(inst, dst);
3533	 break;
3534      case FS_OPCODE_DISCARD_NOT:
3535	 generate_discard_not(inst, dst);
3536	 break;
3537      case FS_OPCODE_DISCARD_AND:
3538	 generate_discard_and(inst, src[0]);
3539	 break;
3540      case FS_OPCODE_DDX:
3541	 generate_ddx(inst, dst, src[0]);
3542	 break;
3543      case FS_OPCODE_DDY:
3544	 generate_ddy(inst, dst, src[0]);
3545	 break;
3546
3547      case FS_OPCODE_SPILL:
3548	 generate_spill(inst, src[0]);
3549	 break;
3550
3551      case FS_OPCODE_UNSPILL:
3552	 generate_unspill(inst, dst);
3553	 break;
3554
3555      case FS_OPCODE_PULL_CONSTANT_LOAD:
3556	 generate_pull_constant_load(inst, dst);
3557	 break;
3558
3559      case FS_OPCODE_FB_WRITE:
3560	 generate_fb_write(inst);
3561	 break;
3562      default:
3563	 if (inst->opcode < (int)ARRAY_SIZE(brw_opcodes)) {
3564	    _mesa_problem(ctx, "Unsupported opcode `%s' in FS",
3565			  brw_opcodes[inst->opcode].name);
3566	 } else {
3567	    _mesa_problem(ctx, "Unsupported opcode %d in FS", inst->opcode);
3568	 }
3569	 this->fail = true;
3570      }
3571
3572      if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3573	 for (unsigned int i = last_native_inst; i < p->nr_insn; i++) {
3574	    if (0) {
3575	       printf("0x%08x 0x%08x 0x%08x 0x%08x ",
3576		      ((uint32_t *)&p->store[i])[3],
3577		      ((uint32_t *)&p->store[i])[2],
3578		      ((uint32_t *)&p->store[i])[1],
3579		      ((uint32_t *)&p->store[i])[0]);
3580	    }
3581	    brw_disasm(stdout, &p->store[i], intel->gen);
3582	 }
3583      }
3584
3585      last_native_inst = p->nr_insn;
3586   }
3587
3588   brw_set_uip_jip(p);
3589
3590   /* OK, while the INTEL_DEBUG=wm above is very nice for debugging FS
3591    * emit issues, it doesn't get the jump distances into the output,
3592    * which is often something we want to debug.  So this is here in
3593    * case you're doing that.
3594    */
3595   if (0) {
3596      if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3597	 for (unsigned int i = 0; i < p->nr_insn; i++) {
3598	    printf("0x%08x 0x%08x 0x%08x 0x%08x ",
3599		   ((uint32_t *)&p->store[i])[3],
3600		   ((uint32_t *)&p->store[i])[2],
3601		   ((uint32_t *)&p->store[i])[1],
3602		   ((uint32_t *)&p->store[i])[0]);
3603	    brw_disasm(stdout, &p->store[i], intel->gen);
3604	 }
3605      }
3606   }
3607}
3608
3609GLboolean
3610brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c)
3611{
3612   struct intel_context *intel = &brw->intel;
3613   struct gl_context *ctx = &intel->ctx;
3614   struct gl_shader_program *prog = ctx->Shader.CurrentFragmentProgram;
3615
3616   if (!prog)
3617      return GL_FALSE;
3618
3619   struct brw_shader *shader =
3620     (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3621   if (!shader)
3622      return GL_FALSE;
3623
3624   /* We always use 8-wide mode, at least for now.  For one, flow
3625    * control only works in 8-wide.  Also, when we're fragment shader
3626    * bound, we're almost always under register pressure as well, so
3627    * 8-wide would save us from the performance cliff of spilling
3628    * regs.
3629    */
3630   c->dispatch_width = 8;
3631
3632   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3633      printf("GLSL IR for native fragment shader %d:\n", prog->Name);
3634      _mesa_print_ir(shader->ir, NULL);
3635      printf("\n");
3636   }
3637
3638   /* Now the main event: Visit the shader IR and generate our FS IR for it.
3639    */
3640   fs_visitor v(c, shader);
3641
3642   if (0) {
3643      v.emit_dummy_fs();
3644   } else {
3645      v.calculate_urb_setup();
3646      if (intel->gen < 6)
3647	 v.emit_interpolation_setup_gen4();
3648      else
3649	 v.emit_interpolation_setup_gen6();
3650
3651      /* Generate FS IR for main().  (The visitor only descends into
3652       * functions called "main".)
3653       */
3654      foreach_iter(exec_list_iterator, iter, *shader->ir) {
3655	 ir_instruction *ir = (ir_instruction *)iter.get();
3656	 v.base_ir = ir;
3657	 ir->accept(&v);
3658      }
3659
3660      v.emit_fb_writes();
3661
3662      v.split_virtual_grfs();
3663      v.setup_pull_constants();
3664
3665      v.assign_curb_setup();
3666      v.assign_urb_setup();
3667
3668      bool progress;
3669      do {
3670	 progress = false;
3671
3672	 progress = v.remove_duplicate_mrf_writes() || progress;
3673
3674	 progress = v.propagate_constants() || progress;
3675	 progress = v.register_coalesce() || progress;
3676	 progress = v.compute_to_mrf() || progress;
3677	 progress = v.dead_code_eliminate() || progress;
3678      } while (progress);
3679
3680      if (0) {
3681	 /* Debug of register spilling: Go spill everything. */
3682	 int virtual_grf_count = v.virtual_grf_next;
3683	 for (int i = 1; i < virtual_grf_count; i++) {
3684	    v.spill_reg(i);
3685	 }
3686      }
3687
3688      if (0)
3689	 v.assign_regs_trivial();
3690      else {
3691	 while (!v.assign_regs()) {
3692	    if (v.fail)
3693	       break;
3694	 }
3695      }
3696   }
3697
3698   if (!v.fail)
3699      v.generate_code();
3700
3701   assert(!v.fail); /* FINISHME: Cleanly fail, tested at link time, etc. */
3702
3703   if (v.fail)
3704      return GL_FALSE;
3705
3706   c->prog_data.total_grf = v.grf_used;
3707
3708   return GL_TRUE;
3709}
3710