brw_fs.cpp revision 963431829055f63ec94d88c97a5d07d30e49833a
/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Eric Anholt <eric@anholt.net>
 *
 */

extern "C" {

#include <sys/types.h>

#include "main/macros.h"
#include "main/shaderobj.h"
#include "main/uniforms.h"
#include "program/prog_parameter.h"
#include "program/prog_print.h"
#include "program/prog_optimize.h"
#include "program/register_allocate.h"
#include "program/sampler.h"
#include "program/hash_table.h"
#include "brw_context.h"
#include "brw_eu.h"
#include "brw_wm.h"
}
#include "brw_fs.h"
#include "../glsl/glsl_types.h"
#include "../glsl/ir_optimization.h"
#include "../glsl/ir_print_visitor.h"

#define MAX_INSTRUCTION (1 << 30)
static struct brw_reg brw_reg_from_fs_reg(class fs_reg *reg);

struct gl_shader *
brw_new_shader(struct gl_context *ctx, GLuint name, GLuint type)
{
   struct brw_shader *shader;

   shader = rzalloc(NULL, struct brw_shader);
   if (shader) {
      shader->base.Type = type;
      shader->base.Name = name;
      _mesa_init_shader(ctx, &shader->base);
   }

   return &shader->base;
}

struct gl_shader_program *
brw_new_shader_program(struct gl_context *ctx, GLuint name)
{
   struct brw_shader_program *prog;
   prog = rzalloc(NULL, struct brw_shader_program);
   if (prog) {
      prog->base.Name = name;
      _mesa_init_shader_program(ctx, &prog->base);
   }
   return &prog->base;
}

GLboolean
brw_link_shader(struct gl_context *ctx, struct gl_shader_program *prog)
{
   struct brw_context *brw = brw_context(ctx);
   struct intel_context *intel = &brw->intel;

   struct brw_shader *shader =
      (struct brw_shader *)prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
   if (shader != NULL) {
      void *mem_ctx = ralloc_context(NULL);
      bool progress;

      if (shader->ir)
	 ralloc_free(shader->ir);
      shader->ir = new(shader) exec_list;
      clone_ir_list(mem_ctx, shader->ir, shader->base.ir);

      do_mat_op_to_vec(shader->ir);
      lower_instructions(shader->ir,
			 MOD_TO_FRACT |
			 DIV_TO_MUL_RCP |
			 SUB_TO_ADD_NEG |
			 EXP_TO_EXP2 |
			 LOG_TO_LOG2);

      /* Pre-gen6 HW can only nest if-statements 16 deep.  Beyond this,
       * if-statements need to be flattened.
       */
      if (intel->gen < 6)
	 lower_if_to_cond_assign(shader->ir, 16);

      do_lower_texture_projection(shader->ir);
      do_vec_index_to_cond_assign(shader->ir);
      brw_do_cubemap_normalize(shader->ir);
      lower_noise(shader->ir);
      lower_quadop_vector(shader->ir, false);
      lower_variable_index_to_cond_assign(shader->ir,
					  GL_TRUE, /* input */
					  GL_TRUE, /* output */
					  GL_TRUE, /* temp */
					  GL_TRUE /* uniform */
					  );

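      /* Iterate the splitting passes and common optimizations to a fixed
       * point: channel expansion and vector splitting expose new
       * opportunities for the common optimizations, and vice versa.
       */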
      do {
	 progress = false;

	 brw_do_channel_expressions(shader->ir);
	 brw_do_vector_splitting(shader->ir);

	 progress = do_lower_jumps(shader->ir, true, true,
				   true, /* main return */
				   false, /* continue */
				   false /* loops */
				   ) || progress;

	 progress = do_common_optimization(shader->ir, true, 32) || progress;
      } while (progress);

      validate_ir_tree(shader->ir);

      reparent_ir(shader->ir, shader->ir);
      ralloc_free(mem_ctx);
   }

   if (!_mesa_ir_link_shader(ctx, prog))
      return GL_FALSE;

   return GL_TRUE;
}

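/** Returns the size of a GLSL type in scalar components, which is the
 * unit used below to size the virtual GRFs backing values of that type.
 */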
static int
type_size(const struct glsl_type *type)
{
   unsigned int size, i;

   switch (type->base_type) {
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_BOOL:
      return type->components();
   case GLSL_TYPE_ARRAY:
      return type_size(type->fields.array) * type->length;
   case GLSL_TYPE_STRUCT:
      size = 0;
      for (i = 0; i < type->length; i++) {
	 size += type_size(type->fields.structure[i].type);
      }
      return size;
   case GLSL_TYPE_SAMPLER:
      /* Samplers take up no register space, since they're baked in at
       * link time.
       */
      return 0;
   default:
      assert(!"not reached");
      return 0;
   }
}

void
fs_visitor::fail(const char *format, ...)
{
   if (!failed) {
      failed = true;

      if (INTEL_DEBUG & DEBUG_WM) {
	 fprintf(stderr, "FS compile failed: ");

	 va_list va;
	 va_start(va, format);
	 vfprintf(stderr, format, va);
	 va_end(va);
      }
   }
}

/**
 * Returns how many MRFs an FS opcode will write over.
 *
 * Note that this is not the 0 or 1 implied writes in an actual gen
 * instruction -- the FS opcodes often generate MOVs in addition.
 */
int
fs_visitor::implied_mrf_writes(fs_inst *inst)
{
   if (inst->mlen == 0)
      return 0;

   switch (inst->opcode) {
   case FS_OPCODE_RCP:
   case FS_OPCODE_RSQ:
   case FS_OPCODE_SQRT:
   case FS_OPCODE_EXP2:
   case FS_OPCODE_LOG2:
   case FS_OPCODE_SIN:
   case FS_OPCODE_COS:
      return 1;
   case FS_OPCODE_POW:
      return 2;
   case FS_OPCODE_TEX:
   case FS_OPCODE_TXB:
   case FS_OPCODE_TXD:
   case FS_OPCODE_TXL:
      return 1;
   case FS_OPCODE_FB_WRITE:
      return 2;
   case FS_OPCODE_PULL_CONSTANT_LOAD:
   case FS_OPCODE_UNSPILL:
      return 1;
   case FS_OPCODE_SPILL:
      return 2;
   default:
      assert(!"not reached");
      return inst->mlen;
   }
}

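/** Allocates a new virtual GRF of the given size (in scalar components)
 * and returns its register number, doubling the size array as needed.
 */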
int
fs_visitor::virtual_grf_alloc(int size)
{
   if (virtual_grf_array_size <= virtual_grf_next) {
      if (virtual_grf_array_size == 0)
	 virtual_grf_array_size = 16;
      else
	 virtual_grf_array_size *= 2;
      virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
				   virtual_grf_array_size);

      /* This slot is always unused. */
      virtual_grf_sizes[0] = 0;
   }
   virtual_grf_sizes[virtual_grf_next] = size;
   return virtual_grf_next++;
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int hw_reg)
{
   init();
   this->file = file;
   this->hw_reg = hw_reg;
   this->type = BRW_REGISTER_TYPE_F;
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int hw_reg, uint32_t type)
{
   init();
   this->file = file;
   this->hw_reg = hw_reg;
   this->type = type;
}

int
brw_type_for_base_type(const struct glsl_type *type)
{
   switch (type->base_type) {
   case GLSL_TYPE_FLOAT:
      return BRW_REGISTER_TYPE_F;
   case GLSL_TYPE_INT:
   case GLSL_TYPE_BOOL:
      return BRW_REGISTER_TYPE_D;
   case GLSL_TYPE_UINT:
      return BRW_REGISTER_TYPE_UD;
   case GLSL_TYPE_ARRAY:
   case GLSL_TYPE_STRUCT:
   case GLSL_TYPE_SAMPLER:
      /* These should be overridden with the type of the member when
       * dereferenced into.  BRW_REGISTER_TYPE_UD seems like a likely
       * way to trip up if we don't.
       */
      return BRW_REGISTER_TYPE_UD;
   default:
      assert(!"not reached");
      return BRW_REGISTER_TYPE_F;
   }
}

/** Automatic reg constructor. */
fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
{
   init();

   this->file = GRF;
   this->reg = v->virtual_grf_alloc(type_size(type));
   this->reg_offset = 0;
   this->type = brw_type_for_base_type(type);
}

fs_reg *
fs_visitor::variable_storage(ir_variable *var)
{
   return (fs_reg *)hash_table_find(this->variable_ht, var);
}

/* Our support for uniforms is piggy-backed on the struct
 * gl_fragment_program, because that's where the values actually
 * get stored, rather than in some global gl_shader_program uniform
 * store.
 */
int
fs_visitor::setup_uniform_values(int loc, const glsl_type *type)
{
   unsigned int offset = 0;

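   /* A matrix is laid out as matrix_columns consecutive column vectors,
    * so set up each column in turn and accumulate the offset.
    */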
   if (type->is_matrix()) {
      const glsl_type *column = glsl_type::get_instance(GLSL_TYPE_FLOAT,
							type->vector_elements,
							1);

      for (unsigned int i = 0; i < type->matrix_columns; i++) {
	 offset += setup_uniform_values(loc + offset, column);
      }

      return offset;
   }

   switch (type->base_type) {
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_BOOL:
      for (unsigned int i = 0; i < type->vector_elements; i++) {
	 unsigned int param = c->prog_data.nr_params++;

	 assert(param < ARRAY_SIZE(c->prog_data.param));

	 switch (type->base_type) {
	 case GLSL_TYPE_FLOAT:
	    c->prog_data.param_convert[param] = PARAM_NO_CONVERT;
	    break;
	 case GLSL_TYPE_UINT:
	    c->prog_data.param_convert[param] = PARAM_CONVERT_F2U;
	    break;
	 case GLSL_TYPE_INT:
	    c->prog_data.param_convert[param] = PARAM_CONVERT_F2I;
	    break;
	 case GLSL_TYPE_BOOL:
	    c->prog_data.param_convert[param] = PARAM_CONVERT_F2B;
	    break;
	 default:
	    assert(!"not reached");
	    c->prog_data.param_convert[param] = PARAM_NO_CONVERT;
	    break;
	 }
	 this->param_index[param] = loc;
	 this->param_offset[param] = i;
      }
      return 1;

   case GLSL_TYPE_STRUCT:
      for (unsigned int i = 0; i < type->length; i++) {
	 offset += setup_uniform_values(loc + offset,
					type->fields.structure[i].type);
      }
      return offset;

   case GLSL_TYPE_ARRAY:
      for (unsigned int i = 0; i < type->length; i++) {
	 offset += setup_uniform_values(loc + offset, type->fields.array);
      }
      return offset;

   case GLSL_TYPE_SAMPLER:
      /* The sampler takes up a slot, but we don't use any values from it. */
      return 1;

   default:
      assert(!"not reached");
      return 0;
   }
}


/* Our support for builtin uniforms is even scarier than non-builtin.
 * It sits on top of the PROG_STATE_VAR parameters that are
 * automatically updated from GL context state.
 */
void
fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
{
   const ir_state_slot *const slots = ir->state_slots;
   assert(ir->state_slots != NULL);

   for (unsigned int i = 0; i < ir->num_state_slots; i++) {
      /* This state reference has already been setup by ir_to_mesa, but we'll
       * get the same index back here.
       */
      int index = _mesa_add_state_reference(this->fp->Base.Parameters,
					    (gl_state_index *)slots[i].tokens);

      /* Add each of the unique swizzles of the element as a parameter.
       * This'll end up matching the expected layout of the
       * array/matrix/structure we're trying to fill in.
       */
      int last_swiz = -1;
      for (unsigned int j = 0; j < 4; j++) {
	 int swiz = GET_SWZ(slots[i].swizzle, j);
	 if (swiz == last_swiz)
	    break;
	 last_swiz = swiz;

	 c->prog_data.param_convert[c->prog_data.nr_params] =
	    PARAM_NO_CONVERT;
	 this->param_index[c->prog_data.nr_params] = index;
	 this->param_offset[c->prog_data.nr_params] = swiz;
	 c->prog_data.nr_params++;
      }
   }
}

fs_reg *
fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   fs_reg wpos = *reg;
   fs_reg neg_y = this->pixel_y;
   neg_y.negate = true;
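   /* gl_FragCoord.y needs flipping when exactly one of "origin is lower
    * left" and "rendering to an FBO" holds (the XOR below), presumably
    * because window system framebuffers are y-inverted relative to FBO
    * rendering.
    */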
   bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;

   /* gl_FragCoord.x */
   if (ir->pixel_center_integer) {
      emit(BRW_OPCODE_MOV, wpos, this->pixel_x);
   } else {
      emit(BRW_OPCODE_ADD, wpos, this->pixel_x, fs_reg(0.5f));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.y */
   if (!flip && ir->pixel_center_integer) {
      emit(BRW_OPCODE_MOV, wpos, this->pixel_y);
   } else {
      fs_reg pixel_y = this->pixel_y;
      float offset = (ir->pixel_center_integer ? 0.0 : 0.5);

      if (flip) {
	 pixel_y.negate = true;
	 offset += c->key.drawable_height - 1.0;
      }

      emit(BRW_OPCODE_ADD, wpos, pixel_y, fs_reg(offset));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.z */
   if (intel->gen >= 6) {
      emit(BRW_OPCODE_MOV, wpos,
	   fs_reg(brw_vec8_grf(c->source_depth_reg, 0)));
   } else {
      emit(FS_OPCODE_LINTERP, wpos, this->delta_x, this->delta_y,
	   interp_reg(FRAG_ATTRIB_WPOS, 2));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.w: Already set up in emit_interpolation */
   emit(BRW_OPCODE_MOV, wpos, this->wpos_w);

   return reg;
}

fs_reg *
fs_visitor::emit_general_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   /* Interpolation is always in floating point regs. */
   reg->type = BRW_REGISTER_TYPE_F;
   fs_reg attr = *reg;

   unsigned int array_elements;
   const glsl_type *type;

   if (ir->type->is_array()) {
      array_elements = ir->type->length;
      if (array_elements == 0) {
	 fail("dereferenced array '%s' has length 0\n", ir->name);
      }
      type = ir->type->fields.array;
   } else {
      array_elements = 1;
      type = ir->type;
   }

   int location = ir->location;
   for (unsigned int i = 0; i < array_elements; i++) {
      for (unsigned int j = 0; j < type->matrix_columns; j++) {
	 if (urb_setup[location] == -1) {
	    /* If there's no incoming setup data for this slot, don't
	     * emit interpolation for it.
	     */
	    attr.reg_offset += type->vector_elements;
	    location++;
	    continue;
	 }

	 bool is_gl_Color =
	    location == FRAG_ATTRIB_COL0 || location == FRAG_ATTRIB_COL1;

	 if (c->key.flat_shade && is_gl_Color) {
	    /* Constant interpolation (flat shading) case. The SF has
	     * handed us defined values in only the constant offset
	     * field of the setup reg.
	     */
	    for (unsigned int k = 0; k < type->vector_elements; k++) {
	       struct brw_reg interp = interp_reg(location, k);
	       interp = suboffset(interp, 3);
	       emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
	       attr.reg_offset++;
	    }
	 } else {
	    /* Perspective interpolation case. */
	    for (unsigned int k = 0; k < type->vector_elements; k++) {
	       struct brw_reg interp = interp_reg(location, k);
	       emit(FS_OPCODE_LINTERP, attr,
		    this->delta_x, this->delta_y, fs_reg(interp));
	       attr.reg_offset++;
	    }

	    if (intel->gen < 6 && !(is_gl_Color && c->key.linear_color)) {
	       attr.reg_offset -= type->vector_elements;
	       for (unsigned int k = 0; k < type->vector_elements; k++) {
		  emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
		  attr.reg_offset++;
	       }
	    }
	 }
	 location++;
      }
   }

   return reg;
}

fs_reg *
fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);

   /* The frontfacing comes in as a bit in the thread payload. */
   if (intel->gen >= 6) {
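      /* Assuming the gen6 payload's back-facing flag is bit 15 of g0.0:
       * shift it down, invert, and mask so *reg ends up 1 for
       * front-facing pixels and 0 otherwise.
       */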
      emit(BRW_OPCODE_ASR, *reg,
	   fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
	   fs_reg(15));
      emit(BRW_OPCODE_NOT, *reg, *reg);
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
   } else {
      struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
      /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
       * us front face
       */
      fs_inst *inst = emit(BRW_OPCODE_CMP, *reg,
			   fs_reg(r1_6ud),
			   fs_reg(1u << 31));
      inst->conditional_mod = BRW_CONDITIONAL_L;
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
   }

   return reg;
}

fs_inst *
fs_visitor::emit_math(fs_opcodes opcode, fs_reg dst, fs_reg src)
{
   switch (opcode) {
   case FS_OPCODE_RCP:
   case FS_OPCODE_RSQ:
   case FS_OPCODE_SQRT:
   case FS_OPCODE_EXP2:
   case FS_OPCODE_LOG2:
   case FS_OPCODE_SIN:
   case FS_OPCODE_COS:
      break;
   default:
      assert(!"not reached: bad math opcode");
      return NULL;
   }

   /* Can't do hstride == 0 args to gen6 math, so expand it out.  We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * The hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (intel->gen >= 6 && (src.file == UNIFORM ||
			   src.abs ||
			   src.negate)) {
      fs_reg expanded = fs_reg(this, glsl_type::float_type);
      emit(BRW_OPCODE_MOV, expanded, src);
      src = expanded;
   }

   fs_inst *inst = emit(opcode, dst, src);

   if (intel->gen < 6) {
      inst->base_mrf = 2;
      inst->mlen = 1;
   }

   return inst;
}

fs_inst *
fs_visitor::emit_math(fs_opcodes opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   int base_mrf = 2;
   fs_inst *inst;

   assert(opcode == FS_OPCODE_POW);

   if (intel->gen >= 6) {
      /* Can't do hstride == 0 args to gen6 math, so expand it out.
       *
       * The hardware ignores source modifiers (negate and abs) on math
       * instructions, so we also move to a temp to set those up.
       */
      if (src0.file == UNIFORM || src0.abs || src0.negate) {
	 fs_reg expanded = fs_reg(this, glsl_type::float_type);
	 emit(BRW_OPCODE_MOV, expanded, src0);
	 src0 = expanded;
      }

      if (src1.file == UNIFORM || src1.abs || src1.negate) {
	 fs_reg expanded = fs_reg(this, glsl_type::float_type);
	 emit(BRW_OPCODE_MOV, expanded, src1);
	 src1 = expanded;
      }

      inst = emit(opcode, dst, src0, src1);
   } else {
      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1), src1);
      inst = emit(opcode, dst, src0, reg_null_f);

      inst->base_mrf = base_mrf;
      inst->mlen = 2;
   }
   return inst;
}

void
fs_visitor::visit(ir_variable *ir)
{
   fs_reg *reg = NULL;

   if (variable_storage(ir))
      return;

   if (strcmp(ir->name, "gl_FragColor") == 0) {
      this->frag_color = ir;
   } else if (strcmp(ir->name, "gl_FragData") == 0) {
      this->frag_data = ir;
   } else if (strcmp(ir->name, "gl_FragDepth") == 0) {
      this->frag_depth = ir;
   }

   if (ir->mode == ir_var_in) {
      if (!strcmp(ir->name, "gl_FragCoord")) {
	 reg = emit_fragcoord_interpolation(ir);
      } else if (!strcmp(ir->name, "gl_FrontFacing")) {
	 reg = emit_frontfacing_interpolation(ir);
      } else {
	 reg = emit_general_interpolation(ir);
      }
      assert(reg);
      hash_table_insert(this->variable_ht, reg, ir);
      return;
   }

   if (ir->mode == ir_var_uniform) {
      int param_index = c->prog_data.nr_params;

      if (!strncmp(ir->name, "gl_", 3)) {
	 setup_builtin_uniform_values(ir);
      } else {
	 setup_uniform_values(ir->location, ir->type);
      }

      reg = new(this->mem_ctx) fs_reg(UNIFORM, param_index);
      reg->type = brw_type_for_base_type(ir->type);
   }

   if (!reg)
      reg = new(this->mem_ctx) fs_reg(this, ir->type);

   hash_table_insert(this->variable_ht, reg, ir);
}

void
fs_visitor::visit(ir_dereference_variable *ir)
{
   fs_reg *reg = variable_storage(ir->var);
   this->result = *reg;
}

void
fs_visitor::visit(ir_dereference_record *ir)
{
   const glsl_type *struct_type = ir->record->type;

   ir->record->accept(this);

   unsigned int offset = 0;
   for (unsigned int i = 0; i < struct_type->length; i++) {
      if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
	 break;
      offset += type_size(struct_type->fields.structure[i].type);
   }
   this->result.reg_offset += offset;
   this->result.type = brw_type_for_base_type(ir->type);
}

void
fs_visitor::visit(ir_dereference_array *ir)
{
   ir_constant *index;
   int element_size;

   ir->array->accept(this);
   index = ir->array_index->as_constant();

   element_size = type_size(ir->type);
   this->result.type = brw_type_for_base_type(ir->type);

   if (index) {
      assert(this->result.file == UNIFORM ||
	     (this->result.file == GRF &&
	      this->result.reg != 0));
      this->result.reg_offset += index->value.i[0] * element_size;
   } else {
      assert(!"FINISHME: non-constant array element");
   }
}

/* Instruction selection: Produce a MOV.sat instead of
 * MIN(MAX(val, 0), 1) when possible.
 */
bool
fs_visitor::try_emit_saturate(ir_expression *ir)
{
   ir_rvalue *sat_val = ir->as_rvalue_to_saturate();

   if (!sat_val)
      return false;

   sat_val->accept(this);
   fs_reg src = this->result;

   this->result = fs_reg(this, ir->type);
   fs_inst *inst = emit(BRW_OPCODE_MOV, this->result, src);
   inst->saturate = true;

   return true;
}

static uint32_t
brw_conditional_for_comparison(unsigned int op)
{
   switch (op) {
   case ir_binop_less:
      return BRW_CONDITIONAL_L;
   case ir_binop_greater:
      return BRW_CONDITIONAL_G;
   case ir_binop_lequal:
      return BRW_CONDITIONAL_LE;
   case ir_binop_gequal:
      return BRW_CONDITIONAL_GE;
   case ir_binop_equal:
   case ir_binop_all_equal: /* same as equal for scalars */
      return BRW_CONDITIONAL_Z;
   case ir_binop_nequal:
   case ir_binop_any_nequal: /* same as nequal for scalars */
      return BRW_CONDITIONAL_NZ;
   default:
      assert(!"not reached: bad operation for comparison");
      return BRW_CONDITIONAL_NZ;
   }
}

void
fs_visitor::visit(ir_expression *ir)
{
   unsigned int operand;
   fs_reg op[2], temp;
   fs_inst *inst;

   assert(ir->get_num_operands() <= 2);

   if (try_emit_saturate(ir))
      return;

   for (operand = 0; operand < ir->get_num_operands(); operand++) {
      ir->operands[operand]->accept(this);
      if (this->result.file == BAD_FILE) {
	 ir_print_visitor v;
	 fail("Failed to get tree for expression operand:\n");
	 ir->operands[operand]->accept(&v);
      }
      op[operand] = this->result;

      /* Matrix expression operands should have been broken down to vector
       * operations already.
       */
      assert(!ir->operands[operand]->type->is_matrix());
      /* And then those vector operands should have been broken down to scalar.
       */
      assert(!ir->operands[operand]->type->is_vector());
   }

   /* Storage for our result.  If our result goes into an assignment, it will
    * just get copy-propagated out, so no worries.
    */
   this->result = fs_reg(this, ir->type);

   switch (ir->operation) {
   case ir_unop_logic_not:
      /* Note that BRW_OPCODE_NOT is not appropriate here, since it produces
       * the ones' complement of the whole register, not just bit 0.
       */
      emit(BRW_OPCODE_XOR, this->result, op[0], fs_reg(1));
      break;
   case ir_unop_neg:
      op[0].negate = !op[0].negate;
      this->result = op[0];
      break;
   case ir_unop_abs:
      op[0].abs = true;
      op[0].negate = false;
      this->result = op[0];
      break;
   case ir_unop_sign:
      temp = fs_reg(this, ir->type);

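      /* sign(x): start at 0.0, then use predicated MOVs to overwrite the
       * result with 1.0 where x > 0 and -1.0 where x < 0.
       */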
      emit(BRW_OPCODE_MOV, this->result, fs_reg(0.0f));

      inst = emit(BRW_OPCODE_CMP, reg_null_f, op[0], fs_reg(0.0f));
      inst->conditional_mod = BRW_CONDITIONAL_G;
      inst = emit(BRW_OPCODE_MOV, this->result, fs_reg(1.0f));
      inst->predicated = true;

      inst = emit(BRW_OPCODE_CMP, reg_null_f, op[0], fs_reg(0.0f));
      inst->conditional_mod = BRW_CONDITIONAL_L;
      inst = emit(BRW_OPCODE_MOV, this->result, fs_reg(-1.0f));
      inst->predicated = true;

      break;
   case ir_unop_rcp:
      emit_math(FS_OPCODE_RCP, this->result, op[0]);
      break;

   case ir_unop_exp2:
      emit_math(FS_OPCODE_EXP2, this->result, op[0]);
      break;
   case ir_unop_log2:
      emit_math(FS_OPCODE_LOG2, this->result, op[0]);
      break;
   case ir_unop_exp:
   case ir_unop_log:
      assert(!"not reached: should be handled by ir_explog_to_explog2");
      break;
   case ir_unop_sin:
   case ir_unop_sin_reduced:
      emit_math(FS_OPCODE_SIN, this->result, op[0]);
      break;
   case ir_unop_cos:
   case ir_unop_cos_reduced:
      emit_math(FS_OPCODE_COS, this->result, op[0]);
      break;

   case ir_unop_dFdx:
      emit(FS_OPCODE_DDX, this->result, op[0]);
      break;
   case ir_unop_dFdy:
      emit(FS_OPCODE_DDY, this->result, op[0]);
      break;

   case ir_binop_add:
      emit(BRW_OPCODE_ADD, this->result, op[0], op[1]);
      break;
   case ir_binop_sub:
      assert(!"not reached: should be handled by ir_sub_to_add_neg");
      break;

   case ir_binop_mul:
      emit(BRW_OPCODE_MUL, this->result, op[0], op[1]);
      break;
   case ir_binop_div:
      assert(!"not reached: should be handled by ir_div_to_mul_rcp");
      break;
   case ir_binop_mod:
      assert(!"ir_binop_mod should have been converted to b * fract(a/b)");
      break;

   case ir_binop_less:
   case ir_binop_greater:
   case ir_binop_lequal:
   case ir_binop_gequal:
   case ir_binop_equal:
   case ir_binop_all_equal:
   case ir_binop_nequal:
   case ir_binop_any_nequal:
      temp = this->result;
      /* original gen4 does implicit conversion before comparison. */
      if (intel->gen < 5)
	 temp.type = op[0].type;

      inst = emit(BRW_OPCODE_CMP, temp, op[0], op[1]);
      inst->conditional_mod = brw_conditional_for_comparison(ir->operation);
      emit(BRW_OPCODE_AND, this->result, this->result, fs_reg(0x1));
      break;

   case ir_binop_logic_xor:
      emit(BRW_OPCODE_XOR, this->result, op[0], op[1]);
      break;

   case ir_binop_logic_or:
      emit(BRW_OPCODE_OR, this->result, op[0], op[1]);
      break;

   case ir_binop_logic_and:
      emit(BRW_OPCODE_AND, this->result, op[0], op[1]);
      break;

   case ir_binop_dot:
   case ir_unop_any:
      assert(!"not reached: should be handled by brw_fs_channel_expressions");
      break;

   case ir_unop_noise:
      assert(!"not reached: should be handled by lower_noise");
      break;

   case ir_quadop_vector:
      assert(!"not reached: should be handled by lower_quadop_vector");
      break;

   case ir_unop_sqrt:
      emit_math(FS_OPCODE_SQRT, this->result, op[0]);
      break;

   case ir_unop_rsq:
      emit_math(FS_OPCODE_RSQ, this->result, op[0]);
      break;

   case ir_unop_i2f:
   case ir_unop_b2f:
   case ir_unop_b2i:
   case ir_unop_f2i:
      emit(BRW_OPCODE_MOV, this->result, op[0]);
      break;
   case ir_unop_f2b:
   case ir_unop_i2b:
      temp = this->result;
      /* original gen4 does implicit conversion before comparison. */
      if (intel->gen < 5)
	 temp.type = op[0].type;

      inst = emit(BRW_OPCODE_CMP, temp, op[0], fs_reg(0.0f));
      inst->conditional_mod = BRW_CONDITIONAL_NZ;
      inst = emit(BRW_OPCODE_AND, this->result, this->result, fs_reg(1));
      break;

   case ir_unop_trunc:
      emit(BRW_OPCODE_RNDZ, this->result, op[0]);
      break;
   case ir_unop_ceil:
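      /* ceil(x) = -floor(-x): negate the source, round down, and mark the
       * result as negated.
       */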
      op[0].negate = !op[0].negate;
      inst = emit(BRW_OPCODE_RNDD, this->result, op[0]);
      this->result.negate = true;
      break;
   case ir_unop_floor:
      inst = emit(BRW_OPCODE_RNDD, this->result, op[0]);
      break;
   case ir_unop_fract:
      inst = emit(BRW_OPCODE_FRC, this->result, op[0]);
      break;
   case ir_unop_round_even:
      emit(BRW_OPCODE_RNDE, this->result, op[0]);
      break;

   case ir_binop_min:
      inst = emit(BRW_OPCODE_CMP, this->result, op[0], op[1]);
      inst->conditional_mod = BRW_CONDITIONAL_L;

      inst = emit(BRW_OPCODE_SEL, this->result, op[0], op[1]);
      inst->predicated = true;
      break;
   case ir_binop_max:
      inst = emit(BRW_OPCODE_CMP, this->result, op[0], op[1]);
      inst->conditional_mod = BRW_CONDITIONAL_G;

      inst = emit(BRW_OPCODE_SEL, this->result, op[0], op[1]);
      inst->predicated = true;
      break;

   case ir_binop_pow:
      emit_math(FS_OPCODE_POW, this->result, op[0], op[1]);
      break;

   case ir_unop_bit_not:
      inst = emit(BRW_OPCODE_NOT, this->result, op[0]);
      break;
   case ir_binop_bit_and:
      inst = emit(BRW_OPCODE_AND, this->result, op[0], op[1]);
      break;
   case ir_binop_bit_xor:
      inst = emit(BRW_OPCODE_XOR, this->result, op[0], op[1]);
      break;
   case ir_binop_bit_or:
      inst = emit(BRW_OPCODE_OR, this->result, op[0], op[1]);
      break;

   case ir_unop_u2f:
   case ir_binop_lshift:
   case ir_binop_rshift:
      assert(!"GLSL 1.30 features unsupported");
      break;
   }
}

void
fs_visitor::emit_assignment_writes(fs_reg &l, fs_reg &r,
				   const glsl_type *type, bool predicated)
{
   switch (type->base_type) {
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_BOOL:
      for (unsigned int i = 0; i < type->components(); i++) {
	 l.type = brw_type_for_base_type(type);
	 r.type = brw_type_for_base_type(type);

	 fs_inst *inst = emit(BRW_OPCODE_MOV, l, r);
	 inst->predicated = predicated;

	 l.reg_offset++;
	 r.reg_offset++;
      }
      break;
   case GLSL_TYPE_ARRAY:
      for (unsigned int i = 0; i < type->length; i++) {
	 emit_assignment_writes(l, r, type->fields.array, predicated);
      }
      break;

   case GLSL_TYPE_STRUCT:
      for (unsigned int i = 0; i < type->length; i++) {
	 emit_assignment_writes(l, r, type->fields.structure[i].type,
				predicated);
      }
      break;

   case GLSL_TYPE_SAMPLER:
      break;

   default:
      assert(!"not reached");
      break;
   }
}

void
fs_visitor::visit(ir_assignment *ir)
{
   struct fs_reg l, r;
   fs_inst *inst;

   /* FINISHME: arrays on the lhs */
   ir->lhs->accept(this);
   l = this->result;

   ir->rhs->accept(this);
   r = this->result;

   assert(l.file != BAD_FILE);
   assert(r.file != BAD_FILE);

   if (ir->condition) {
      emit_bool_to_cond_code(ir->condition);
   }

   if (ir->lhs->type->is_scalar() ||
       ir->lhs->type->is_vector()) {
      for (int i = 0; i < ir->lhs->type->vector_elements; i++) {
	 if (ir->write_mask & (1 << i)) {
	    inst = emit(BRW_OPCODE_MOV, l, r);
	    if (ir->condition)
	       inst->predicated = true;
	    r.reg_offset++;
	 }
	 l.reg_offset++;
      }
   } else {
      emit_assignment_writes(l, r, ir->lhs->type, ir->condition != NULL);
   }
}

fs_inst *
fs_visitor::emit_texture_gen4(ir_texture *ir, fs_reg dst, fs_reg coordinate)
{
   int mlen;
   int base_mrf = 1;
   bool simd16 = false;
   fs_reg orig_dst;

   /* g0 header. */
   mlen = 1;

   if (ir->shadow_comparitor) {
      for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
	 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i), coordinate);
	 coordinate.reg_offset++;
      }
      /* gen4's SIMD8 sampler always has the slots for u,v,r present. */
      mlen += 3;

      if (ir->op == ir_tex) {
	 /* There's no plain shadow compare message, so we use shadow
	  * compare with a bias of 0.0.
	  */
	 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), fs_reg(0.0f));
	 mlen++;
      } else if (ir->op == ir_txb) {
	 ir->lod_info.bias->accept(this);
	 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
	 mlen++;
      } else {
	 assert(ir->op == ir_txl);
	 ir->lod_info.lod->accept(this);
	 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
	 mlen++;
      }

      ir->shadow_comparitor->accept(this);
      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
      mlen++;
   } else if (ir->op == ir_tex) {
      for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
	 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i), coordinate);
	 coordinate.reg_offset++;
      }
      /* gen4's SIMD8 sampler always has the slots for u,v,r present. */
      mlen += 3;
   } else if (ir->op == ir_txd) {
      assert(!"TXD isn't supported on gen4 yet.");
   } else {
      /* Oh joy.  gen4 doesn't have SIMD8 non-shadow-compare bias/lod
       * instructions.  We'll need to do SIMD16 here.
       */
      assert(ir->op == ir_txb || ir->op == ir_txl);

      for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
	 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i * 2), coordinate);
	 coordinate.reg_offset++;
      }

      /* lod/bias appears after u/v/r. */
      mlen += 6;

      if (ir->op == ir_txb) {
	 ir->lod_info.bias->accept(this);
	 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
	 mlen++;
      } else {
	 ir->lod_info.lod->accept(this);
	 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
	 mlen++;
      }

      /* The unused upper half. */
      mlen++;

      /* Now, since we're doing simd16, the return is 2 interleaved
       * vec4s where the odd-indexed ones are junk. We'll need to move
       * this weirdness around to the expected layout.
       */
      simd16 = true;
      orig_dst = dst;
      dst = fs_reg(this, glsl_type::get_array_instance(glsl_type::vec4_type,
						       2));
      dst.type = BRW_REGISTER_TYPE_F;
   }

   fs_inst *inst = NULL;
   switch (ir->op) {
   case ir_tex:
      inst = emit(FS_OPCODE_TEX, dst);
      break;
   case ir_txb:
      inst = emit(FS_OPCODE_TXB, dst);
      break;
   case ir_txl:
      inst = emit(FS_OPCODE_TXL, dst);
      break;
   case ir_txd:
      inst = emit(FS_OPCODE_TXD, dst);
      break;
   case ir_txf:
      assert(!"GLSL 1.30 features unsupported");
      break;
   }
   inst->base_mrf = base_mrf;
   inst->mlen = mlen;

   if (simd16) {
      for (int i = 0; i < 4; i++) {
	 emit(BRW_OPCODE_MOV, orig_dst, dst);
	 orig_dst.reg_offset++;
	 dst.reg_offset += 2;
      }
   }

   return inst;
}

fs_inst *
fs_visitor::emit_texture_gen5(ir_texture *ir, fs_reg dst, fs_reg coordinate)
{
   /* gen5's SIMD8 sampler has slots for u, v, r, array index, then
    * optional parameters like shadow comparitor or LOD bias.  If
    * optional parameters aren't present, those trailing base slots
    * don't need to be included in the message.
    *
    * We never fill in unused slots either way, which may look
    * surprising in the disassembly.
    */
   int mlen = 1; /* g0 header always present. */
   int base_mrf = 1;

   for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i), coordinate);
      coordinate.reg_offset++;
   }
   mlen += ir->coordinate->type->vector_elements;

   if (ir->shadow_comparitor) {
      mlen = MAX2(mlen, 5);

      ir->shadow_comparitor->accept(this);
      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
      mlen++;
   }

   fs_inst *inst = NULL;
   switch (ir->op) {
   case ir_tex:
      inst = emit(FS_OPCODE_TEX, dst);
      break;
   case ir_txb:
      ir->lod_info.bias->accept(this);
      mlen = MAX2(mlen, 5);
      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
      mlen++;

      inst = emit(FS_OPCODE_TXB, dst);
      break;
   case ir_txl:
      ir->lod_info.lod->accept(this);
      mlen = MAX2(mlen, 5);
      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
      mlen++;

      inst = emit(FS_OPCODE_TXL, dst);
      break;
   case ir_txd:
   case ir_txf:
      assert(!"GLSL 1.30 features unsupported");
      break;
   }
   inst->base_mrf = base_mrf;
   inst->mlen = mlen;

   return inst;
}

void
fs_visitor::visit(ir_texture *ir)
{
   int sampler;
   fs_inst *inst = NULL;

   ir->coordinate->accept(this);
   fs_reg coordinate = this->result;

   if (ir->offset != NULL) {
      ir_constant *offset = ir->offset->as_constant();
      assert(offset != NULL);

      signed char offsets[3];
      for (unsigned i = 0; i < ir->offset->type->vector_elements; i++)
	 offsets[i] = (signed char) offset->value.i[i];

      /* Combine all three offsets into a single unsigned dword:
       *
       *    bits 11:8 - U Offset (X component)
       *    bits  7:4 - V Offset (Y component)
       *    bits  3:0 - R Offset (Z component)
       */
      unsigned offset_bits = 0;
      for (unsigned i = 0; i < ir->offset->type->vector_elements; i++) {
	 const unsigned shift = 4 * (2 - i);
	 offset_bits |= (offsets[i] << shift) & (0xF << shift);
      }

      /* Explicitly set up the message header by copying g0 to msg reg m1. */
      emit(BRW_OPCODE_MOV, fs_reg(MRF, 1, BRW_REGISTER_TYPE_UD),
	   fs_reg(GRF, 0, BRW_REGISTER_TYPE_UD));

      /* Then set the offset bits in DWord 2 of the message header. */
      emit(BRW_OPCODE_MOV,
	   fs_reg(retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, 1, 2),
			 BRW_REGISTER_TYPE_UD)),
	   fs_reg(brw_imm_uw(offset_bits)));
   }

   /* Should be lowered by do_lower_texture_projection */
   assert(!ir->projector);

   sampler = _mesa_get_sampler_uniform_value(ir->sampler,
					     ctx->Shader.CurrentFragmentProgram,
					     &brw->fragment_program->Base);
   sampler = c->fp->program.Base.SamplerUnits[sampler];

   /* The 965 requires the EU to do the normalization of GL rectangle
    * texture coordinates.  We use the program parameter state
    * tracking to get the scaling factor.
    */
   if (ir->sampler->type->sampler_dimensionality == GLSL_SAMPLER_DIM_RECT) {
      struct gl_program_parameter_list *params = c->fp->program.Base.Parameters;
      int tokens[STATE_LENGTH] = {
	 STATE_INTERNAL,
	 STATE_TEXRECT_SCALE,
	 sampler,
	 0,
	 0
      };

      c->prog_data.param_convert[c->prog_data.nr_params] =
	 PARAM_NO_CONVERT;
      c->prog_data.param_convert[c->prog_data.nr_params + 1] =
	 PARAM_NO_CONVERT;

      fs_reg scale_x = fs_reg(UNIFORM, c->prog_data.nr_params);
      fs_reg scale_y = fs_reg(UNIFORM, c->prog_data.nr_params + 1);
      GLuint index = _mesa_add_state_reference(params,
					       (gl_state_index *)tokens);

      this->param_index[c->prog_data.nr_params] = index;
      this->param_offset[c->prog_data.nr_params] = 0;
      c->prog_data.nr_params++;
      this->param_index[c->prog_data.nr_params] = index;
      this->param_offset[c->prog_data.nr_params] = 1;
      c->prog_data.nr_params++;

      fs_reg dst = fs_reg(this, ir->coordinate->type);
      fs_reg src = coordinate;
      coordinate = dst;

      emit(BRW_OPCODE_MUL, dst, src, scale_x);
      dst.reg_offset++;
      src.reg_offset++;
      emit(BRW_OPCODE_MUL, dst, src, scale_y);
   }

   /* Writemasking doesn't eliminate channels on SIMD8 texture
    * samples, so don't worry about them.
    */
   fs_reg dst = fs_reg(this, glsl_type::vec4_type);

   if (intel->gen < 5) {
      inst = emit_texture_gen4(ir, dst, coordinate);
   } else {
      inst = emit_texture_gen5(ir, dst, coordinate);
   }

   /* If there's an offset, we already set up m1.  To avoid the implied move,
    * use the null register.  Otherwise, we want an implied move from g0.
    */
   if (ir->offset != NULL)
      inst->src[0] = fs_reg(brw_null_reg());
   else
      inst->src[0] = fs_reg(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW));

   inst->sampler = sampler;

   this->result = dst;

   if (ir->shadow_comparitor)
      inst->shadow_compare = true;

   if (ir->type == glsl_type::float_type) {
      /* Ignore DEPTH_TEXTURE_MODE swizzling. */
      assert(ir->sampler->type->sampler_shadow);
   } else if (c->key.tex_swizzles[inst->sampler] != SWIZZLE_NOOP) {
      fs_reg swizzle_dst = fs_reg(this, glsl_type::vec4_type);

      for (int i = 0; i < 4; i++) {
	 int swiz = GET_SWZ(c->key.tex_swizzles[inst->sampler], i);
	 fs_reg l = swizzle_dst;
	 l.reg_offset += i;

	 if (swiz == SWIZZLE_ZERO) {
	    emit(BRW_OPCODE_MOV, l, fs_reg(0.0f));
	 } else if (swiz == SWIZZLE_ONE) {
	    emit(BRW_OPCODE_MOV, l, fs_reg(1.0f));
	 } else {
	    fs_reg r = dst;
	    r.reg_offset += GET_SWZ(c->key.tex_swizzles[inst->sampler], i);
	    emit(BRW_OPCODE_MOV, l, r);
	 }
      }
      this->result = swizzle_dst;
   }
}

void
fs_visitor::visit(ir_swizzle *ir)
{
   ir->val->accept(this);
   fs_reg val = this->result;

   if (ir->type->vector_elements == 1) {
      this->result.reg_offset += ir->mask.x;
      return;
   }

   fs_reg result = fs_reg(this, ir->type);
   this->result = result;

   for (unsigned int i = 0; i < ir->type->vector_elements; i++) {
      fs_reg channel = val;
      int swiz = 0;

      switch (i) {
      case 0:
	 swiz = ir->mask.x;
	 break;
      case 1:
	 swiz = ir->mask.y;
	 break;
      case 2:
	 swiz = ir->mask.z;
	 break;
      case 3:
	 swiz = ir->mask.w;
	 break;
      }

      channel.reg_offset += swiz;
      emit(BRW_OPCODE_MOV, result, channel);
      result.reg_offset++;
   }
}

void
fs_visitor::visit(ir_discard *ir)
{
   fs_reg temp = fs_reg(this, glsl_type::uint_type);

   assert(ir->condition == NULL); /* FINISHME */

   emit(FS_OPCODE_DISCARD_NOT, temp, reg_null_d);
   emit(FS_OPCODE_DISCARD_AND, reg_null_d, temp);
   kill_emitted = true;
}

void
fs_visitor::visit(ir_constant *ir)
{
   /* Set this->result to reg at the bottom of the function because some code
    * paths will cause this visitor to be applied to other fields.  This will
    * cause the value stored in this->result to be modified.
    *
    * Make reg constant so that it doesn't get accidentally modified along the
    * way.  Yes, I actually had this problem. :(
    */
   const fs_reg reg(this, ir->type);
   fs_reg dst_reg = reg;

   if (ir->type->is_array()) {
      const unsigned size = type_size(ir->type->fields.array);

      for (unsigned i = 0; i < ir->type->length; i++) {
	 ir->array_elements[i]->accept(this);
	 fs_reg src_reg = this->result;

	 dst_reg.type = src_reg.type;
	 for (unsigned j = 0; j < size; j++) {
	    emit(BRW_OPCODE_MOV, dst_reg, src_reg);
	    src_reg.reg_offset++;
	    dst_reg.reg_offset++;
	 }
      }
   } else if (ir->type->is_record()) {
      foreach_list(node, &ir->components) {
	 ir_instruction *const field = (ir_instruction *) node;
	 const unsigned size = type_size(field->type);

	 field->accept(this);
	 fs_reg src_reg = this->result;

	 dst_reg.type = src_reg.type;
	 for (unsigned j = 0; j < size; j++) {
	    emit(BRW_OPCODE_MOV, dst_reg, src_reg);
	    src_reg.reg_offset++;
	    dst_reg.reg_offset++;
	 }
      }
   } else {
      const unsigned size = type_size(ir->type);

      for (unsigned i = 0; i < size; i++) {
	 switch (ir->type->base_type) {
	 case GLSL_TYPE_FLOAT:
	    emit(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.f[i]));
	    break;
	 case GLSL_TYPE_UINT:
	    emit(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.u[i]));
	    break;
	 case GLSL_TYPE_INT:
	    emit(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.i[i]));
	    break;
	 case GLSL_TYPE_BOOL:
	    emit(BRW_OPCODE_MOV, dst_reg, fs_reg((int)ir->value.b[i]));
	    break;
	 default:
	    assert(!"Non-float/uint/int/bool constant");
	 }
	 dst_reg.reg_offset++;
      }
   }

   this->result = reg;
}

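/** Evaluates a boolean rvalue and sets the hardware flag register via a
 * conditional mod, folding comparisons and logic ops directly into the
 * flag-generating instruction where possible.
 */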
void
fs_visitor::emit_bool_to_cond_code(ir_rvalue *ir)
{
   ir_expression *expr = ir->as_expression();

   if (expr) {
      fs_reg op[2];
      fs_inst *inst;

      assert(expr->get_num_operands() <= 2);
      for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
	 assert(expr->operands[i]->type->is_scalar());

	 expr->operands[i]->accept(this);
	 op[i] = this->result;
      }

      switch (expr->operation) {
      case ir_unop_logic_not:
	 inst = emit(BRW_OPCODE_AND, reg_null_d, op[0], fs_reg(1));
	 inst->conditional_mod = BRW_CONDITIONAL_Z;
	 break;

      case ir_binop_logic_xor:
	 inst = emit(BRW_OPCODE_XOR, reg_null_d, op[0], op[1]);
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 break;

      case ir_binop_logic_or:
	 inst = emit(BRW_OPCODE_OR, reg_null_d, op[0], op[1]);
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 break;

      case ir_binop_logic_and:
	 inst = emit(BRW_OPCODE_AND, reg_null_d, op[0], op[1]);
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 break;

      case ir_unop_f2b:
	 if (intel->gen >= 6) {
	    inst = emit(BRW_OPCODE_CMP, reg_null_d, op[0], fs_reg(0.0f));
	 } else {
	    inst = emit(BRW_OPCODE_MOV, reg_null_f, op[0]);
	 }
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 break;

      case ir_unop_i2b:
	 if (intel->gen >= 6) {
	    inst = emit(BRW_OPCODE_CMP, reg_null_d, op[0], fs_reg(0));
	 } else {
	    inst = emit(BRW_OPCODE_MOV, reg_null_d, op[0]);
	 }
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 break;

      case ir_binop_greater:
      case ir_binop_gequal:
      case ir_binop_less:
      case ir_binop_lequal:
      case ir_binop_equal:
      case ir_binop_all_equal:
      case ir_binop_nequal:
      case ir_binop_any_nequal:
	 inst = emit(BRW_OPCODE_CMP, reg_null_cmp, op[0], op[1]);
	 inst->conditional_mod =
	    brw_conditional_for_comparison(expr->operation);
	 break;

      default:
	 assert(!"not reached");
	 fail("bad cond code\n");
	 break;
      }
      return;
   }

   ir->accept(this);

   if (intel->gen >= 6) {
      fs_inst *inst = emit(BRW_OPCODE_AND, reg_null_d, this->result, fs_reg(1));
      inst->conditional_mod = BRW_CONDITIONAL_NZ;
   } else {
      fs_inst *inst = emit(BRW_OPCODE_MOV, reg_null_d, this->result);
      inst->conditional_mod = BRW_CONDITIONAL_NZ;
   }
}

/**
 * Emit a gen6 IF statement with the comparison folded into the IF
 * instruction.
 */
void
fs_visitor::emit_if_gen6(ir_if *ir)
{
   ir_expression *expr = ir->condition->as_expression();

   if (expr) {
      fs_reg op[2];
      fs_inst *inst;
      fs_reg temp;

      assert(expr->get_num_operands() <= 2);
      for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
	 assert(expr->operands[i]->type->is_scalar());

	 expr->operands[i]->accept(this);
	 op[i] = this->result;
      }

      switch (expr->operation) {
      case ir_unop_logic_not:
	 inst = emit(BRW_OPCODE_IF, temp, op[0], fs_reg(0));
	 inst->conditional_mod = BRW_CONDITIONAL_Z;
	 return;

      case ir_binop_logic_xor:
	 inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], op[1]);
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 return;

      case ir_binop_logic_or:
	 temp = fs_reg(this, glsl_type::bool_type);
	 emit(BRW_OPCODE_OR, temp, op[0], op[1]);
	 inst = emit(BRW_OPCODE_IF, reg_null_d, temp, fs_reg(0));
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 return;

      case ir_binop_logic_and:
	 temp = fs_reg(this, glsl_type::bool_type);
	 emit(BRW_OPCODE_AND, temp, op[0], op[1]);
	 inst = emit(BRW_OPCODE_IF, reg_null_d, temp, fs_reg(0));
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 return;

      case ir_unop_f2b:
	 inst = emit(BRW_OPCODE_IF, reg_null_f, op[0], fs_reg(0));
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 return;

      case ir_unop_i2b:
	 inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], fs_reg(0));
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 return;

      case ir_binop_greater:
      case ir_binop_gequal:
      case ir_binop_less:
      case ir_binop_lequal:
      case ir_binop_equal:
      case ir_binop_all_equal:
      case ir_binop_nequal:
      case ir_binop_any_nequal:
	 inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], op[1]);
	 inst->conditional_mod =
	    brw_conditional_for_comparison(expr->operation);
	 return;
      default:
	 assert(!"not reached");
	 inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], fs_reg(0));
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 fail("bad condition\n");
	 return;
      }
      return;
   }

   ir->condition->accept(this);

   fs_inst *inst = emit(BRW_OPCODE_IF, reg_null_d, this->result, fs_reg(0));
   inst->conditional_mod = BRW_CONDITIONAL_NZ;
}

void
fs_visitor::visit(ir_if *ir)
{
   fs_inst *inst;

   /* Don't point the annotation at the if statement, because then it plus
    * the then and else blocks get printed.
    */
   this->base_ir = ir->condition;

   if (intel->gen >= 6) {
      emit_if_gen6(ir);
   } else {
      emit_bool_to_cond_code(ir->condition);

      inst = emit(BRW_OPCODE_IF);
      inst->predicated = true;
   }

   foreach_iter(exec_list_iterator, iter, ir->then_instructions) {
      ir_instruction *ir = (ir_instruction *)iter.get();
      this->base_ir = ir;

      ir->accept(this);
   }

   if (!ir->else_instructions.is_empty()) {
      emit(BRW_OPCODE_ELSE);

      foreach_iter(exec_list_iterator, iter, ir->else_instructions) {
	 ir_instruction *ir = (ir_instruction *)iter.get();
	 this->base_ir = ir;

	 ir->accept(this);
      }
   }

   emit(BRW_OPCODE_ENDIF);
}

void
fs_visitor::visit(ir_loop *ir)
{
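   /* Counted loops lower to: a MOV of the initial value into the counter,
    * a DO/WHILE block with a predicated BREAK on the limit comparison at
    * the top, and an ADD of the increment at the bottom of the body.
    */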
   fs_reg counter = reg_undef;

   if (ir->counter) {
      this->base_ir = ir->counter;
      ir->counter->accept(this);
      counter = *(variable_storage(ir->counter));

      if (ir->from) {
	 this->base_ir = ir->from;
	 ir->from->accept(this);

	 emit(BRW_OPCODE_MOV, counter, this->result);
      }
   }

   emit(BRW_OPCODE_DO);

   if (ir->to) {
      this->base_ir = ir->to;
      ir->to->accept(this);

      fs_inst *inst = emit(BRW_OPCODE_CMP, reg_null_cmp, counter, this->result);
      inst->conditional_mod = brw_conditional_for_comparison(ir->cmp);

      inst = emit(BRW_OPCODE_BREAK);
      inst->predicated = true;
   }

   foreach_iter(exec_list_iterator, iter, ir->body_instructions) {
      ir_instruction *ir = (ir_instruction *)iter.get();

      this->base_ir = ir;
      ir->accept(this);
   }

   if (ir->increment) {
      this->base_ir = ir->increment;
      ir->increment->accept(this);
      emit(BRW_OPCODE_ADD, counter, counter, this->result);
   }

   emit(BRW_OPCODE_WHILE);
}

void
fs_visitor::visit(ir_loop_jump *ir)
{
   switch (ir->mode) {
   case ir_loop_jump::jump_break:
      emit(BRW_OPCODE_BREAK);
      break;
   case ir_loop_jump::jump_continue:
      emit(BRW_OPCODE_CONTINUE);
      break;
   }
}

void
fs_visitor::visit(ir_call *ir)
{
   assert(!"FINISHME");
}

void
fs_visitor::visit(ir_return *ir)
{
   assert(!"FINISHME");
}

void
fs_visitor::visit(ir_function *ir)
{
   /* Ignore function bodies other than main() -- we shouldn't see calls to
    * them since they should all be inlined before we get to ir_to_mesa.
    */
   if (strcmp(ir->name, "main") == 0) {
      const ir_function_signature *sig;
      exec_list empty;

      sig = ir->matching_signature(&empty);

      assert(sig);

      foreach_iter(exec_list_iterator, iter, sig->body) {
	 ir_instruction *ir = (ir_instruction *)iter.get();
	 this->base_ir = ir;

	 ir->accept(this);
      }
   }
}

void
fs_visitor::visit(ir_function_signature *ir)
{
   assert(!"not reached");
   (void)ir;
}

fs_inst *
fs_visitor::emit(fs_inst inst)
{
   fs_inst *list_inst = new(mem_ctx) fs_inst;
   *list_inst = inst;

   list_inst->annotation = this->current_annotation;
   list_inst->ir = this->base_ir;

   this->instructions.push_tail(list_inst);

   return list_inst;
}

/** Emits a dummy fragment shader consisting of magenta for bringup purposes. */
void
fs_visitor::emit_dummy_fs()
{
   /* Everyone's favorite color. */
   emit(BRW_OPCODE_MOV, fs_reg(MRF, 2), fs_reg(1.0f));
   emit(BRW_OPCODE_MOV, fs_reg(MRF, 3), fs_reg(0.0f));
   emit(BRW_OPCODE_MOV, fs_reg(MRF, 4), fs_reg(1.0f));
   emit(BRW_OPCODE_MOV, fs_reg(MRF, 5), fs_reg(0.0f));

   fs_inst *write;
   write = emit(FS_OPCODE_FB_WRITE, fs_reg(0), fs_reg(0));
   write->base_mrf = 0;
}

/* The register location here is relative to the start of the URB
 * data.  It will get adjusted to be a real location before
 * generate_code() time.
 */
struct brw_reg
fs_visitor::interp_reg(int location, int channel)
{
1907   int regnr = urb_setup[location] * 2 + channel / 2;
1908   int stride = (channel & 1) * 4;
1909
1910   assert(urb_setup[location] != -1);
1911
1912   return brw_vec1_grf(regnr, stride);
1913}
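
/* Worked example (editor's illustration, hypothetical numbers): with
 * urb_setup[location] == 3, channel 0 is brw_vec1_grf(6, 0), channel 1
 * is brw_vec1_grf(6, 4), channel 2 is brw_vec1_grf(7, 0), and channel 3
 * is brw_vec1_grf(7, 4) -- two channels per register, so each attribute
 * occupies two registers, matching the urb_read_length math in
 * calculate_urb_setup().
 */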
1914
1915/** Emits the interpolation for the varying inputs. */
1916void
1917fs_visitor::emit_interpolation_setup_gen4()
1918{
1919   struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
1920
1921   this->current_annotation = "compute pixel centers";
1922   this->pixel_x = fs_reg(this, glsl_type::uint_type);
1923   this->pixel_y = fs_reg(this, glsl_type::uint_type);
1924   this->pixel_x.type = BRW_REGISTER_TYPE_UW;
1925   this->pixel_y.type = BRW_REGISTER_TYPE_UW;
1926   emit(BRW_OPCODE_ADD,
1927	this->pixel_x,
1928	fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
1929	fs_reg(brw_imm_v(0x10101010)));
1930   emit(BRW_OPCODE_ADD,
1931	this->pixel_y,
1932	fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
1933	fs_reg(brw_imm_v(0x11001100)));
1934
1935   this->current_annotation = "compute pixel deltas from v0";
1936   if (brw->has_pln) {
1937      this->delta_x = fs_reg(this, glsl_type::vec2_type);
1938      this->delta_y = this->delta_x;
1939      this->delta_y.reg_offset++;
1940   } else {
1941      this->delta_x = fs_reg(this, glsl_type::float_type);
1942      this->delta_y = fs_reg(this, glsl_type::float_type);
1943   }
1944   emit(BRW_OPCODE_ADD, this->delta_x,
1945	this->pixel_x, fs_reg(negate(brw_vec1_grf(1, 0))));
1946   emit(BRW_OPCODE_ADD, this->delta_y,
1947	this->pixel_y, fs_reg(negate(brw_vec1_grf(1, 1))));
1948
1949   this->current_annotation = "compute pos.w and 1/pos.w";
1950   /* Compute wpos.w.  It's always in our setup, since it's needed to
1951    * interpolate the other attributes.
1952    */
1953   this->wpos_w = fs_reg(this, glsl_type::float_type);
1954   emit(FS_OPCODE_LINTERP, wpos_w, this->delta_x, this->delta_y,
1955	interp_reg(FRAG_ATTRIB_WPOS, 3));
1956   /* Compute the pixel 1/W value from wpos.w. */
1957   this->pixel_w = fs_reg(this, glsl_type::float_type);
1958   emit_math(FS_OPCODE_RCP, this->pixel_w, wpos_w);
1959   this->current_annotation = NULL;
1960}
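
/* A note on the immediates above (editor's sketch): brw_imm_v() packs
 * eight 4-bit values, lowest nibble first, so 0x10101010 is the
 * per-channel X offset vector 0,1,0,1,0,1,0,1 and 0x11001100 is the Y
 * offset vector 0,0,1,1,0,0,1,1.  Adding these to the subspan origins
 * replicated out of g1 yields the pixel centers: a subspan at (8, 4)
 * produces pixel_x = 8,9,8,9 and pixel_y = 4,4,5,5.
 */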
1961
1962/** Emits the interpolation for the varying inputs. */
1963void
1964fs_visitor::emit_interpolation_setup_gen6()
1965{
1966   struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
1967
1968   /* If the pixel centers end up used, the setup is the same as for gen4. */
1969   this->current_annotation = "compute pixel centers";
1970   fs_reg int_pixel_x = fs_reg(this, glsl_type::uint_type);
1971   fs_reg int_pixel_y = fs_reg(this, glsl_type::uint_type);
1972   int_pixel_x.type = BRW_REGISTER_TYPE_UW;
1973   int_pixel_y.type = BRW_REGISTER_TYPE_UW;
1974   emit(BRW_OPCODE_ADD,
1975	int_pixel_x,
1976	fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
1977	fs_reg(brw_imm_v(0x10101010)));
1978   emit(BRW_OPCODE_ADD,
1979	int_pixel_y,
1980	fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
1981	fs_reg(brw_imm_v(0x11001100)));
1982
1983   /* As of gen6, we can no longer mix float and int sources.  We have
1984    * to turn the integer pixel centers into floats for their actual
1985    * use.
1986    */
1987   this->pixel_x = fs_reg(this, glsl_type::float_type);
1988   this->pixel_y = fs_reg(this, glsl_type::float_type);
1989   emit(BRW_OPCODE_MOV, this->pixel_x, int_pixel_x);
1990   emit(BRW_OPCODE_MOV, this->pixel_y, int_pixel_y);
1991
1992   this->current_annotation = "compute pos.w";
1993   this->pixel_w = fs_reg(brw_vec8_grf(c->source_w_reg, 0));
1994   this->wpos_w = fs_reg(this, glsl_type::float_type);
1995   emit_math(FS_OPCODE_RCP, this->wpos_w, this->pixel_w);
1996
1997   this->delta_x = fs_reg(brw_vec8_grf(2, 0));
1998   this->delta_y = fs_reg(brw_vec8_grf(3, 0));
1999
2000   this->current_annotation = NULL;
2001}
2002
2003void
2004fs_visitor::emit_fb_writes()
2005{
2006   this->current_annotation = "FB write header";
2007   bool header_present = true;
2008   int nr = 0;
2009
2010   if (intel->gen >= 6 &&
2011       !this->kill_emitted &&
2012       c->key.nr_color_regions == 1) {
2013      header_present = false;
2014   }
2015
2016   if (header_present) {
2017      /* m0, m1 header */
2018      nr += 2;
2019   }
2020
2021   if (c->aa_dest_stencil_reg) {
2022      emit(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
2023	   fs_reg(brw_vec8_grf(c->aa_dest_stencil_reg, 0)));
2024   }
2025
2026   /* Reserve space for color. It'll be filled in per MRT below. */
2027   int color_mrf = nr;
2028   nr += 4;
2029
2030   if (c->source_depth_to_render_target) {
2031      if (c->computes_depth) {
2032	 /* Hand over gl_FragDepth. */
2033	 assert(this->frag_depth);
2034	 fs_reg depth = *(variable_storage(this->frag_depth));
2035
2036	 emit(BRW_OPCODE_MOV, fs_reg(MRF, nr++), depth);
2037      } else {
2038	 /* Pass through the payload depth. */
2039	 emit(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
2040	      fs_reg(brw_vec8_grf(c->source_depth_reg, 0)));
2041      }
2042   }
2043
2044   if (c->dest_depth_reg) {
2045      emit(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
2046	   fs_reg(brw_vec8_grf(c->dest_depth_reg, 0)));
2047   }
2048
2049   fs_reg color = reg_undef;
2050   if (this->frag_color)
2051      color = *(variable_storage(this->frag_color));
2052   else if (this->frag_data) {
2053      color = *(variable_storage(this->frag_data));
2054      color.type = BRW_REGISTER_TYPE_F;
2055   }
2056
2057   for (int target = 0; target < c->key.nr_color_regions; target++) {
2058      this->current_annotation = ralloc_asprintf(this->mem_ctx,
2059						 "FB write target %d",
2060						 target);
2061      if (this->frag_color || this->frag_data) {
2062	 for (int i = 0; i < 4; i++) {
2063	    emit(BRW_OPCODE_MOV, fs_reg(MRF, color_mrf + i), color);
2064	    color.reg_offset++;
2065	 }
2066      }
2067
2068      if (this->frag_color)
2069	 color.reg_offset -= 4;
2070
2071      fs_inst *inst = emit(FS_OPCODE_FB_WRITE);
2072      inst->target = target;
2073      inst->base_mrf = 0;
2074      inst->mlen = nr;
2075      if (target == c->key.nr_color_regions - 1)
2076	 inst->eot = true;
2077      inst->header_present = header_present;
2078   }
2079
2080   if (c->key.nr_color_regions == 0) {
2081      if (c->key.alpha_test && (this->frag_color || this->frag_data)) {
2082	 /* If the alpha test is enabled but there's no color buffer,
2083	  * we still need to send alpha out the pipeline to our null
2084	  * renderbuffer.
2085	  */
2086	 color.reg_offset += 3;
2087	 emit(BRW_OPCODE_MOV, fs_reg(MRF, color_mrf + 3), color);
2088      }
2089
2090      fs_inst *inst = emit(FS_OPCODE_FB_WRITE);
2091      inst->base_mrf = 0;
2092      inst->mlen = nr;
2093      inst->eot = true;
2094      inst->header_present = header_present;
2095   }
2096
2097   this->current_annotation = NULL;
2098}
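
/* Resulting message layout (editor's illustration, assuming the header
 * is present and no stencil or depth sources): m0-m1 hold the header and
 * m2-m5 the four color components, giving mlen == 6.  With multiple
 * render targets, the color MRFs are rewritten once per target and only
 * the final FB write has EOT set.
 */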
2099
2100void
2101fs_visitor::generate_fb_write(fs_inst *inst)
2102{
2103   GLboolean eot = inst->eot;
2104   struct brw_reg implied_header;
2105
2106   /* The header is 2 regs, with g0 and g1 as its contents.  g0 arrives via
2107    * the implied move (or an explicit MOV on gen6); g1 is copied here.
2108    */
2109   brw_push_insn_state(p);
2110   brw_set_mask_control(p, BRW_MASK_DISABLE);
2111   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2112
2113   if (inst->header_present) {
2114      if (intel->gen >= 6) {
2115	 brw_MOV(p,
2116		 brw_message_reg(inst->base_mrf),
2117		 brw_vec8_grf(0, 0));
2118
2119	 if (inst->target > 0) {
2120	    /* Set the render target index for choosing BLEND_STATE. */
2121	    brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, 0, 2),
2122			      BRW_REGISTER_TYPE_UD),
2123		    brw_imm_ud(inst->target));
2124	 }
2125
2126	 /* Clear viewport index, render target array index. */
2127	 brw_AND(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, 0, 0),
2128			   BRW_REGISTER_TYPE_UD),
2129		 retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD),
2130		 brw_imm_ud(0xf7ff));
2131
2132	 implied_header = brw_null_reg();
2133      } else {
2134	 implied_header = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
2135      }
2136
2137      brw_MOV(p,
2138	      brw_message_reg(inst->base_mrf + 1),
2139	      brw_vec8_grf(1, 0));
2140   } else {
2141      implied_header = brw_null_reg();
2142   }
2143
2144   brw_pop_insn_state(p);
2145
2146   brw_fb_WRITE(p,
2147		8, /* dispatch_width */
2148		retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW),
2149		inst->base_mrf,
2150		implied_header,
2151		inst->target,
2152		inst->mlen,
2153		0,
2154		eot,
2155		inst->header_present);
2156}
2157
2158void
2159fs_visitor::generate_linterp(fs_inst *inst,
2160			     struct brw_reg dst, struct brw_reg *src)
2161{
2162   struct brw_reg delta_x = src[0];
2163   struct brw_reg delta_y = src[1];
2164   struct brw_reg interp = src[2];
2165
2166   if (brw->has_pln &&
2167       delta_y.nr == delta_x.nr + 1 &&
2168       (intel->gen >= 6 || (delta_x.nr & 1) == 0)) {
2169      brw_PLN(p, dst, interp, delta_x);
2170   } else {
2171      brw_LINE(p, brw_null_reg(), interp, delta_x);
2172      brw_MAC(p, dst, suboffset(interp, 1), delta_y);
2173   }
2174}
2175
2176void
2177fs_visitor::generate_math(fs_inst *inst,
2178			  struct brw_reg dst, struct brw_reg *src)
2179{
2180   int op;
2181
2182   switch (inst->opcode) {
2183   case FS_OPCODE_RCP:
2184      op = BRW_MATH_FUNCTION_INV;
2185      break;
2186   case FS_OPCODE_RSQ:
2187      op = BRW_MATH_FUNCTION_RSQ;
2188      break;
2189   case FS_OPCODE_SQRT:
2190      op = BRW_MATH_FUNCTION_SQRT;
2191      break;
2192   case FS_OPCODE_EXP2:
2193      op = BRW_MATH_FUNCTION_EXP;
2194      break;
2195   case FS_OPCODE_LOG2:
2196      op = BRW_MATH_FUNCTION_LOG;
2197      break;
2198   case FS_OPCODE_POW:
2199      op = BRW_MATH_FUNCTION_POW;
2200      break;
2201   case FS_OPCODE_SIN:
2202      op = BRW_MATH_FUNCTION_SIN;
2203      break;
2204   case FS_OPCODE_COS:
2205      op = BRW_MATH_FUNCTION_COS;
2206      break;
2207   default:
2208      assert(!"not reached: unknown math function");
2209      op = 0;
2210      break;
2211   }
2212
2213   if (intel->gen >= 6) {
2214      assert(inst->mlen == 0);
2215
2216      if (inst->opcode == FS_OPCODE_POW) {
2217	 brw_math2(p, dst, op, src[0], src[1]);
2218      } else {
2219	 brw_math(p, dst,
2220		  op,
2221		  inst->saturate ? BRW_MATH_SATURATE_SATURATE :
2222		  BRW_MATH_SATURATE_NONE,
2223		  0, src[0],
2224		  BRW_MATH_DATA_VECTOR,
2225		  BRW_MATH_PRECISION_FULL);
2226      }
2227   } else {
2228      assert(inst->mlen >= 1);
2229
2230      brw_math(p, dst,
2231	       op,
2232	       inst->saturate ? BRW_MATH_SATURATE_SATURATE :
2233	       BRW_MATH_SATURATE_NONE,
2234	       inst->base_mrf, src[0],
2235	       BRW_MATH_DATA_VECTOR,
2236	       BRW_MATH_PRECISION_FULL);
2237   }
2238}
2239
2240void
2241fs_visitor::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
2242{
2243   int msg_type = -1;
2244   int rlen = 4;
2245   uint32_t simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
2246
2247   if (intel->gen >= 5) {
2248      switch (inst->opcode) {
2249      case FS_OPCODE_TEX:
2250	 if (inst->shadow_compare) {
2251	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE;
2252	 } else {
2253	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE;
2254	 }
2255	 break;
2256      case FS_OPCODE_TXB:
2257	 if (inst->shadow_compare) {
2258	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE;
2259	 } else {
2260	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS;
2261	 }
2262	 break;
2263      case FS_OPCODE_TXL:
2264	 if (inst->shadow_compare) {
2265	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE;
2266	 } else {
2267	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD;
2268	 }
2269	 break;
2270      case FS_OPCODE_TXD:
2271	 assert(!"TXD isn't supported on gen5+ yet.");
2272	 break;
2273      }
2274   } else {
2275      switch (inst->opcode) {
2276      case FS_OPCODE_TEX:
2277	 /* Note that G45 and older determine shadow compare and dispatch width
2278	  * from message length for most messages.
2279	  */
2280	 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
2281	 if (inst->shadow_compare) {
2282	    assert(inst->mlen == 6);
2283	 } else {
2284	    assert(inst->mlen <= 4);
2285	 }
2286	 break;
2287      case FS_OPCODE_TXB:
2288	 if (inst->shadow_compare) {
2289	    assert(inst->mlen == 6);
2290	    msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_COMPARE;
2291	 } else {
2292	    assert(inst->mlen == 9);
2293	    msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
2294	    simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
2295	 }
2296	 break;
2297      case FS_OPCODE_TXL:
2298	 if (inst->shadow_compare) {
2299	    assert(inst->mlen == 6);
2300	    msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE;
2301	 } else {
2302	    assert(inst->mlen == 9);
2303	    msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD;
2304	    simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
2305	 }
2306	 break;
2307      case FS_OPCODE_TXD:
2308	 assert(!"TXD isn't supported on gen4 yet.");
2309	 break;
2310      }
2311   }
2312   assert(msg_type != -1);
2313
2314   if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) {
2315      rlen = 8;
2316      dst = vec16(dst);
2317   }
2318
2319   brw_SAMPLE(p,
2320	      retype(dst, BRW_REGISTER_TYPE_UW),
2321	      inst->base_mrf,
2322	      src,
2323              SURF_INDEX_TEXTURE(inst->sampler),
2324	      inst->sampler,
2325	      WRITEMASK_XYZW,
2326	      msg_type,
2327	      rlen,
2328	      inst->mlen,
2329	      0,
2330	      1,
2331	      simd_mode);
2332}
2333
2334
2335/* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
2336 * looking like:
2337 *
2338 * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
2339 *
2340 * and we're trying to produce:
2341 *
2342 *           DDX                     DDY
2343 * dst: (ss0.tr - ss0.tl)     (ss0.tl - ss0.bl)
2344 *      (ss0.tr - ss0.tl)     (ss0.tr - ss0.br)
2345 *      (ss0.br - ss0.bl)     (ss0.tl - ss0.bl)
2346 *      (ss0.br - ss0.bl)     (ss0.tr - ss0.br)
2347 *      (ss1.tr - ss1.tl)     (ss1.tl - ss1.bl)
2348 *      (ss1.tr - ss1.tl)     (ss1.tr - ss1.br)
2349 *      (ss1.br - ss1.bl)     (ss1.tl - ss1.bl)
2350 *      (ss1.br - ss1.bl)     (ss1.tr - ss1.br)
2351 *
2352 * with another set of two subspans added in 16-pixel dispatch mode.
2353 *
2354 * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
2355 * for each pair, and vertstride = 2 jumps us 2 elements after processing a
2356 * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
2357 * between each other.  We could probably do it like ddx and swizzle the right
2358 * order later, but bail for now and just produce
2359 * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
2360 */
2361void
2362fs_visitor::generate_ddx(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
2363{
2364   struct brw_reg src0 = brw_reg(src.file, src.nr, 1,
2365				 BRW_REGISTER_TYPE_F,
2366				 BRW_VERTICAL_STRIDE_2,
2367				 BRW_WIDTH_2,
2368				 BRW_HORIZONTAL_STRIDE_0,
2369				 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
2370   struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
2371				 BRW_REGISTER_TYPE_F,
2372				 BRW_VERTICAL_STRIDE_2,
2373				 BRW_WIDTH_2,
2374				 BRW_HORIZONTAL_STRIDE_0,
2375				 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
2376   brw_ADD(p, dst, src0, negate(src1));
2377}
2378
2379void
2380fs_visitor::generate_ddy(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
2381{
2382   struct brw_reg src0 = brw_reg(src.file, src.nr, 0,
2383				 BRW_REGISTER_TYPE_F,
2384				 BRW_VERTICAL_STRIDE_4,
2385				 BRW_WIDTH_4,
2386				 BRW_HORIZONTAL_STRIDE_0,
2387				 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
2388   struct brw_reg src1 = brw_reg(src.file, src.nr, 2,
2389				 BRW_REGISTER_TYPE_F,
2390				 BRW_VERTICAL_STRIDE_4,
2391				 BRW_WIDTH_4,
2392				 BRW_HORIZONTAL_STRIDE_0,
2393				 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
2394   brw_ADD(p, dst, src0, negate(src1));
2395}
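
/* Region walkthrough for generate_ddx() above (editor's illustration):
 * with subnr 1, vstride 2, width 2 and hstride 0, src0 reads
 * tr,tr,br,br for each subspan, while src1 (subnr 0) reads tl,tl,bl,bl.
 * The ADD of src0 and the negated src1 therefore computes (tr - tl) for
 * the top pair and (br - bl) for the bottom pair, which is exactly the
 * DDX column of the table above.
 */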
2396
2397void
2398fs_visitor::generate_discard_not(fs_inst *inst, struct brw_reg mask)
2399{
2400   if (intel->gen >= 6) {
2401      /* Gen6 no longer has the mask reg for us to just read the
2402       * active channels from.  However, cmp updates just the channels
2403       * of the flag reg that are enabled, so we can get at the
2404       * channel enables that way.  In this step, make a reg of ones
2405       * we'll compare to.
2406       */
2407      brw_MOV(p, mask, brw_imm_ud(1));
2408   } else {
2409      brw_push_insn_state(p);
2410      brw_set_mask_control(p, BRW_MASK_DISABLE);
2411      brw_NOT(p, mask, brw_mask_reg(1)); /* IMASK */
2412      brw_pop_insn_state(p);
2413   }
2414}
2415
2416void
2417fs_visitor::generate_discard_and(fs_inst *inst, struct brw_reg mask)
2418{
2419   if (intel->gen >= 6) {
2420      struct brw_reg f0 = brw_flag_reg();
2421      struct brw_reg g1 = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
2422
2423      brw_push_insn_state(p);
2424      brw_set_mask_control(p, BRW_MASK_DISABLE);
2425      brw_MOV(p, f0, brw_imm_uw(0xffff)); /* inactive channels undiscarded */
2426      brw_pop_insn_state(p);
2427
2428      brw_CMP(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UD),
2429	      BRW_CONDITIONAL_Z, mask, brw_imm_ud(0)); /* active channels fail test */
2430	      /* Undo CMP's whacking of predication. */
2431      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2432
2433      brw_push_insn_state(p);
2434      brw_set_mask_control(p, BRW_MASK_DISABLE);
2435      brw_AND(p, g1, f0, g1);
2436      brw_pop_insn_state(p);
2437   } else {
2438      struct brw_reg g0 = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
2439
2440      mask = brw_uw1_reg(mask.file, mask.nr, 0);
2441
2442      brw_push_insn_state(p);
2443      brw_set_mask_control(p, BRW_MASK_DISABLE);
2444      brw_AND(p, g0, mask, g0);
2445      brw_pop_insn_state(p);
2446   }
2447}
2448
2449void
2450fs_visitor::generate_spill(fs_inst *inst, struct brw_reg src)
2451{
2452   assert(inst->mlen != 0);
2453
2454   brw_MOV(p,
2455	   retype(brw_message_reg(inst->base_mrf + 1), BRW_REGISTER_TYPE_UD),
2456	   retype(src, BRW_REGISTER_TYPE_UD));
2457   brw_oword_block_write_scratch(p, brw_message_reg(inst->base_mrf), 1,
2458				 inst->offset);
2459}
2460
2461void
2462fs_visitor::generate_unspill(fs_inst *inst, struct brw_reg dst)
2463{
2464   assert(inst->mlen != 0);
2465
2466   /* Clear any post destination dependencies that would be ignored by
2467    * the block read.  See the B-Spec for pre-gen5 send instruction.
2468    *
2469    * This could use a better solution, since texture sampling and
2470    * math reads could potentially run into it as well -- anywhere
2471    * that we have a SEND with a destination that is a register that
2472    * was written but not read within the last N instructions (what's
2473    * N?  unsure).  This is rare because of dead code elimination, but
2474    * not impossible.
2475    */
2476   if (intel->gen == 4 && !intel->is_g4x)
2477      brw_MOV(p, brw_null_reg(), dst);
2478
2479   brw_oword_block_read_scratch(p, dst, brw_message_reg(inst->base_mrf), 1,
2480				inst->offset);
2481
2482   if (intel->gen == 4 && !intel->is_g4x) {
2483      /* gen4 errata: destination from a send can't be used as a
2484       * destination until it's been read.  Just read it so we don't
2485       * have to worry.
2486       */
2487      brw_MOV(p, brw_null_reg(), dst);
2488   }
2489}
2490
2491
2492void
2493fs_visitor::generate_pull_constant_load(fs_inst *inst, struct brw_reg dst)
2494{
2495   assert(inst->mlen != 0);
2496
2497   /* Clear any post destination dependencies that would be ignored by
2498    * the block read.  See the B-Spec for pre-gen5 send instruction.
2499    *
2500    * This could use a better solution, since texture sampling and
2501    * math reads could potentially run into it as well -- anywhere
2502    * that we have a SEND with a destination that is a register that
2503    * was written but not read within the last N instructions (what's
2504    * N?  unsure).  This is rare because of dead code elimination, but
2505    * not impossible.
2506    */
2507   if (intel->gen == 4 && !intel->is_g4x)
2508      brw_MOV(p, brw_null_reg(), dst);
2509
2510   brw_oword_block_read(p, dst, brw_message_reg(inst->base_mrf),
2511			inst->offset, SURF_INDEX_FRAG_CONST_BUFFER);
2512
2513   if (intel->gen == 4 && !intel->is_g4x) {
2514      /* gen4 errata: destination from a send can't be used as a
2515       * destination until it's been read.  Just read it so we don't
2516       * have to worry.
2517       */
2518      brw_MOV(p, brw_null_reg(), dst);
2519   }
2520}
2521
2522/**
2523 * To be called after the last _mesa_add_state_reference() call, to
2524 * set up prog_data.param[] for assign_curb_setup() and
2525 * setup_pull_constants().
2526 */
2527void
2528fs_visitor::setup_paramvalues_refs()
2529{
2530   /* Set up the pointers to ParamValues now that that array is finalized. */
2531   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
2532      c->prog_data.param[i] =
2533	 fp->Base.Parameters->ParameterValues[this->param_index[i]] +
2534	 this->param_offset[i];
2535   }
2536}
2537
2538void
2539fs_visitor::assign_curb_setup()
2540{
2541   c->prog_data.first_curbe_grf = c->nr_payload_regs;
2542   c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
2543
2544   /* Map the offsets in the UNIFORM file to fixed HW regs. */
2545   foreach_iter(exec_list_iterator, iter, this->instructions) {
2546      fs_inst *inst = (fs_inst *)iter.get();
2547
2548      for (unsigned int i = 0; i < 3; i++) {
2549	 if (inst->src[i].file == UNIFORM) {
2550	    int constant_nr = inst->src[i].hw_reg + inst->src[i].reg_offset;
2551	    struct brw_reg brw_reg = brw_vec1_grf(c->prog_data.first_curbe_grf +
2552						  constant_nr / 8,
2553						  constant_nr % 8);
2554
2555	    inst->src[i].file = FIXED_HW_REG;
2556	    inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
2557	 }
2558      }
2559   }
2560}
2561
2562void
2563fs_visitor::calculate_urb_setup()
2564{
2565   for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
2566      urb_setup[i] = -1;
2567   }
2568
2569   int urb_next = 0;
2570   /* Figure out where each of the incoming setup attributes lands. */
2571   if (intel->gen >= 6) {
2572      for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
2573	 if (brw->fragment_program->Base.InputsRead & BITFIELD64_BIT(i)) {
2574	    urb_setup[i] = urb_next++;
2575	 }
2576      }
2577   } else {
2578      /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
2579      for (unsigned int i = 0; i < VERT_RESULT_MAX; i++) {
2580	 if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
2581	    int fp_index;
2582
2583	    if (i >= VERT_RESULT_VAR0)
2584	       fp_index = i - (VERT_RESULT_VAR0 - FRAG_ATTRIB_VAR0);
2585	    else if (i <= VERT_RESULT_TEX7)
2586	       fp_index = i;
2587	    else
2588	       fp_index = -1;
2589
2590	    if (fp_index >= 0)
2591	       urb_setup[fp_index] = urb_next++;
2592	 }
2593      }
2594   }
2595
2596   /* Each attribute is 4 setup channels, each of which is half a reg. */
2597   c->prog_data.urb_read_length = urb_next * 2;
2598}
2599
2600void
2601fs_visitor::assign_urb_setup()
2602{
2603   int urb_start = c->prog_data.first_curbe_grf + c->prog_data.curb_read_length;
2604
2605   /* Offset all the urb_setup[] indices by the actual position of the
2606    * setup regs, now that the location of the constants has been chosen.
2607    */
2608   foreach_iter(exec_list_iterator, iter, this->instructions) {
2609      fs_inst *inst = (fs_inst *)iter.get();
2610
2611      if (inst->opcode == FS_OPCODE_LINTERP) {
2612	 assert(inst->src[2].file == FIXED_HW_REG);
2613	 inst->src[2].fixed_hw_reg.nr += urb_start;
2614      }
2615
2616      if (inst->opcode == FS_OPCODE_CINTERP) {
2617	 assert(inst->src[0].file == FIXED_HW_REG);
2618	 inst->src[0].fixed_hw_reg.nr += urb_start;
2619      }
2620   }
2621
2622   this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
2623}
2624
2625/**
2626 * Split large virtual GRFs into separate components if we can.
2627 *
2628 * This is mostly duplicated with what brw_fs_vector_splitting does,
2629 * but that's really conservative because it's afraid of doing
2630 * splitting that doesn't result in real progress after the rest of
2631 * the optimization phases, which would cause infinite looping in
2632 * optimization.  We can do it once here, safely.  This also has the
2633 * opportunity to split interpolated values, or maybe even uniforms,
2634 * which we don't have at the IR level.
2635 *
2636 * We want to split, because virtual GRFs are what we register
2637 * allocate and spill (due to contiguousness requirements for some
2638 * instructions), and they're what we naturally generate in the
2639 * codegen process, but most virtual GRFs don't actually need to be
2640 * contiguous sets of GRFs.  If we split, we'll end up with reduced
2641 * live intervals and better dead code elimination and coalescing.
2642 */
2643void
2644fs_visitor::split_virtual_grfs()
2645{
2646   int num_vars = this->virtual_grf_next;
2647   bool split_grf[num_vars];
2648   int new_virtual_grf[num_vars];
2649
2650   /* Try to split anything larger than one register. */
2651   for (int i = 0; i < num_vars; i++) {
2652      if (this->virtual_grf_sizes[i] != 1)
2653	 split_grf[i] = true;
2654      else
2655	 split_grf[i] = false;
2656   }
2657
2658   if (brw->has_pln) {
2659      /* PLN opcodes rely on the delta_xy being contiguous. */
2660      split_grf[this->delta_x.reg] = false;
2661   }
2662
2663   foreach_iter(exec_list_iterator, iter, this->instructions) {
2664      fs_inst *inst = (fs_inst *)iter.get();
2665
2666      /* Texturing produces 4 contiguous registers, so no splitting. */
2667      if (inst->is_tex()) {
2668	 split_grf[inst->dst.reg] = false;
2669      }
2670   }
2671
2672   /* Allocate new space for split regs.  Note that the virtual
2673    * numbers will be contiguous.
2674    */
2675   for (int i = 0; i < num_vars; i++) {
2676      if (split_grf[i]) {
2677	 new_virtual_grf[i] = virtual_grf_alloc(1);
2678	 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
2679	    int reg = virtual_grf_alloc(1);
2680	    assert(reg == new_virtual_grf[i] + j - 1);
2681	    (void) reg;
2682	 }
2683	 this->virtual_grf_sizes[i] = 1;
2684      }
2685   }
2686
2687   foreach_iter(exec_list_iterator, iter, this->instructions) {
2688      fs_inst *inst = (fs_inst *)iter.get();
2689
2690      if (inst->dst.file == GRF &&
2691	  split_grf[inst->dst.reg] &&
2692	  inst->dst.reg_offset != 0) {
2693	 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
2694			  inst->dst.reg_offset - 1);
2695	 inst->dst.reg_offset = 0;
2696      }
2697      for (int i = 0; i < 3; i++) {
2698	 if (inst->src[i].file == GRF &&
2699	     split_grf[inst->src[i].reg] &&
2700	     inst->src[i].reg_offset != 0) {
2701	    inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
2702				inst->src[i].reg_offset - 1);
2703	    inst->src[i].reg_offset = 0;
2704	 }
2705      }
2706   }
2707   this->live_intervals_valid = false;
2708}
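
/* Example (editor's illustration): a size-3 virtual GRF n is rewritten
 * so that reg_offset 0 stays in n itself, while offsets 1 and 2 move to
 * new_virtual_grf[n] and new_virtual_grf[n] + 1.  Every access then
 * becomes a size-1 GRF at reg_offset 0, which is what gives the
 * register allocator independent live intervals to work with.
 */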
2709
2710/**
2711 * Choose accesses from the UNIFORM file to demote to using the pull
2712 * constant buffer.
2713 *
2714 * We allow a fragment shader to have more than the GL-specified minimum
2715 * maximum number of fragment shader uniform components (64).  If there
2716 * are too many of these, they would fill up all of the register space.
2717 * So, this will push some of them out to the pull constant buffer and
2718 * update the program to load them.
2719 */
2720void
2721fs_visitor::setup_pull_constants()
2722{
2723   /* Only allow 16 registers (128 uniform components) as push constants. */
2724   unsigned int max_uniform_components = 16 * 8;
2725   if (c->prog_data.nr_params <= max_uniform_components)
2726      return;
2727
2728   /* Just demote the end of the list.  We could probably do better
2729    * here, demoting things that are rarely used in the program first.
2730    */
2731   int pull_uniform_base = max_uniform_components;
2732   int pull_uniform_count = c->prog_data.nr_params - pull_uniform_base;
2733
2734   foreach_iter(exec_list_iterator, iter, this->instructions) {
2735      fs_inst *inst = (fs_inst *)iter.get();
2736
2737      for (int i = 0; i < 3; i++) {
2738	 if (inst->src[i].file != UNIFORM)
2739	    continue;
2740
2741	 int uniform_nr = inst->src[i].hw_reg + inst->src[i].reg_offset;
2742	 if (uniform_nr < pull_uniform_base)
2743	    continue;
2744
2745	 fs_reg dst = fs_reg(this, glsl_type::float_type);
2746	 fs_inst *pull = new(mem_ctx) fs_inst(FS_OPCODE_PULL_CONSTANT_LOAD,
2747					      dst);
2748	 pull->offset = ((uniform_nr - pull_uniform_base) * 4) & ~15;
2749	 pull->ir = inst->ir;
2750	 pull->annotation = inst->annotation;
2751	 pull->base_mrf = 14;
2752	 pull->mlen = 1;
2753
2754	 inst->insert_before(pull);
2755
2756	 inst->src[i].file = GRF;
2757	 inst->src[i].reg = dst.reg;
2758	 inst->src[i].reg_offset = 0;
2759	 inst->src[i].smear = (uniform_nr - pull_uniform_base) & 3;
2760      }
2761   }
2762
2763   for (int i = 0; i < pull_uniform_count; i++) {
2764      c->prog_data.pull_param[i] = c->prog_data.param[pull_uniform_base + i];
2765      c->prog_data.pull_param_convert[i] =
2766	 c->prog_data.param_convert[pull_uniform_base + i];
2767   }
2768   c->prog_data.nr_params -= pull_uniform_count;
2769   c->prog_data.nr_pull_params = pull_uniform_count;
2770}
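
/* Example (editor's illustration, hypothetical counts): with
 * nr_params == 160 and the 128-component push limit above, uniforms
 * 128..159 become pull constants.  A use of uniform 130 gets a
 * FS_OPCODE_PULL_CONSTANT_LOAD at offset ((130 - 128) * 4) & ~15 == 0,
 * the 16-byte-aligned block containing it, and then smears component
 * (130 - 128) & 3 == 2 out of the loaded register.
 */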
2771
2772void
2773fs_visitor::calculate_live_intervals()
2774{
2775   if (this->live_intervals_valid)
2776      return;
2777
2778   int num_vars = this->virtual_grf_next;
2779   int *def = ralloc_array(mem_ctx, int, num_vars);
2780   int *use = ralloc_array(mem_ctx, int, num_vars);
2781   int loop_depth = 0;
2782   int loop_start = 0;
2783   int bb_header_ip = 0;
2784
2785   for (int i = 0; i < num_vars; i++) {
2786      def[i] = MAX_INSTRUCTION;
2787      use[i] = -1;
2788   }
2789
2790   int ip = 0;
2791   foreach_iter(exec_list_iterator, iter, this->instructions) {
2792      fs_inst *inst = (fs_inst *)iter.get();
2793
2794      if (inst->opcode == BRW_OPCODE_DO) {
2795	 if (loop_depth++ == 0)
2796	    loop_start = ip;
2797      } else if (inst->opcode == BRW_OPCODE_WHILE) {
2798	 loop_depth--;
2799
2800	 if (loop_depth == 0) {
2801	    /* Patches up the use of vars marked for being live across
2802	     * the whole loop.
2803	     */
2804	    for (int i = 0; i < num_vars; i++) {
2805	       if (use[i] == loop_start) {
2806		  use[i] = ip;
2807	       }
2808	    }
2809	 }
2810      } else {
2811	 for (unsigned int i = 0; i < 3; i++) {
2812	    if (inst->src[i].file == GRF && inst->src[i].reg != 0) {
2813	       int reg = inst->src[i].reg;
2814
2815	       if (!loop_depth) {
2816		  use[reg] = ip;
2817	       } else {
2818		  def[reg] = MIN2(loop_start, def[reg]);
2819		  use[reg] = loop_start;
2820
2821		  /* Nobody else is going to go smash our start to
2822		   * later in the loop now, because def[reg] now
2823		   * points before the bb header.
2824		   */
2825	       }
2826	    }
2827	 }
2828	 if (inst->dst.file == GRF && inst->dst.reg != 0) {
2829	    int reg = inst->dst.reg;
2830
2831	    if (!loop_depth) {
2832	       def[reg] = MIN2(def[reg], ip);
2833	    } else {
2834	       def[reg] = MIN2(def[reg], loop_start);
2835	    }
2836	 }
2837      }
2838
2839      ip++;
2840
2841      /* Set the basic block header IP.  This is used for determining
2842       * if a complete def of a single-register virtual GRF in a loop
2843       * dominates a use in the same basic block.  It's a quick way to
2844       * reduce the live interval range of most registers used in a
2845       * loop.
2846       */
2847      if (inst->opcode == BRW_OPCODE_IF ||
2848	  inst->opcode == BRW_OPCODE_ELSE ||
2849	  inst->opcode == BRW_OPCODE_ENDIF ||
2850	  inst->opcode == BRW_OPCODE_DO ||
2851	  inst->opcode == BRW_OPCODE_WHILE ||
2852	  inst->opcode == BRW_OPCODE_BREAK ||
2853	  inst->opcode == BRW_OPCODE_CONTINUE) {
2854	 bb_header_ip = ip;
2855      }
2856   }
2857
2858   ralloc_free(this->virtual_grf_def);
2859   ralloc_free(this->virtual_grf_use);
2860   this->virtual_grf_def = def;
2861   this->virtual_grf_use = use;
2862
2863   this->live_intervals_valid = true;
2864}
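
/* Loop handling example (editor's illustration):
 *
 *    0: MOV   g4, ...
 *    1: DO
 *    2: ADD   g4, g4, ...
 *    3: WHILE
 *
 * The use of g4 at ip 2 is first recorded as loop_start (1) and then
 * patched to the WHILE's ip (3) when the loop closes, so g4 is treated
 * as live across the whole loop body instead of dying mid-loop.
 */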
2865
2866/**
2867 * Attempts to move immediate constants into the immediate
2868 * constant slot of following instructions.
2869 *
2870 * Immediate constants are a bit tricky -- they have to be in the last
2871 * operand slot, and you can't do abs/negate on them.
2872 */
2873
2874bool
2875fs_visitor::propagate_constants()
2876{
2877   bool progress = false;
2878
2879   calculate_live_intervals();
2880
2881   foreach_iter(exec_list_iterator, iter, this->instructions) {
2882      fs_inst *inst = (fs_inst *)iter.get();
2883
2884      if (inst->opcode != BRW_OPCODE_MOV ||
2885	  inst->predicated ||
2886	  inst->dst.file != GRF || inst->src[0].file != IMM ||
2887	  inst->dst.type != inst->src[0].type)
2888	 continue;
2889
2890      /* Don't bother with cases where we should have had the
2891       * operation on the constant folded in GLSL already.
2892       */
2893      if (inst->saturate)
2894	 continue;
2895
2896      /* Found a move of a constant to a GRF.  Find anything else using the GRF
2897       * before it's written, and replace it with the constant if we can.
2898       */
2899      exec_list_iterator scan_iter = iter;
2900      scan_iter.next();
2901      for (; scan_iter.has_next(); scan_iter.next()) {
2902	 fs_inst *scan_inst = (fs_inst *)scan_iter.get();
2903
2904	 if (scan_inst->opcode == BRW_OPCODE_DO ||
2905	     scan_inst->opcode == BRW_OPCODE_WHILE ||
2906	     scan_inst->opcode == BRW_OPCODE_ELSE ||
2907	     scan_inst->opcode == BRW_OPCODE_ENDIF) {
2908	    break;
2909	 }
2910
2911	 for (int i = 2; i >= 0; i--) {
2912	    if (scan_inst->src[i].file != GRF ||
2913		scan_inst->src[i].reg != inst->dst.reg ||
2914		scan_inst->src[i].reg_offset != inst->dst.reg_offset)
2915	       continue;
2916
2917	    /* Don't bother with cases where we should have had the
2918	     * operation on the constant folded in GLSL already.
2919	     */
2920	    if (scan_inst->src[i].negate || scan_inst->src[i].abs)
2921	       continue;
2922
2923	    switch (scan_inst->opcode) {
2924	    case BRW_OPCODE_MOV:
2925	       scan_inst->src[i] = inst->src[0];
2926	       progress = true;
2927	       break;
2928
2929	    case BRW_OPCODE_MUL:
2930	    case BRW_OPCODE_ADD:
2931	       if (i == 1) {
2932		  scan_inst->src[i] = inst->src[0];
2933		  progress = true;
2934	       } else if (i == 0 && scan_inst->src[1].file != IMM) {
2935		  /* Fit this constant in by commuting the operands */
2936		  scan_inst->src[0] = scan_inst->src[1];
2937		  scan_inst->src[1] = inst->src[0];
2938		  progress = true;
2939	       }
2940	       break;
2941	    case BRW_OPCODE_CMP:
2942	    case BRW_OPCODE_SEL:
2943	       if (i == 1) {
2944		  scan_inst->src[i] = inst->src[0];
2945		  progress = true;
2946	       }
2947	    }
2948	 }
2949
2950	 if (scan_inst->dst.file == GRF &&
2951	     scan_inst->dst.reg == inst->dst.reg &&
2952	     (scan_inst->dst.reg_offset == inst->dst.reg_offset ||
2953	      scan_inst->is_tex())) {
2954	    break;
2955	 }
2956      }
2957   }
2958
2959   if (progress)
2960       this->live_intervals_valid = false;
2961
2962   return progress;
2963}
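
/* Example (editor's illustration):
 *
 *    MOV  g4, 2.0f
 *    ADD  g5, g4, g6
 *
 * becomes ADD g5, g6, 2.0f: the immediate may only live in the second
 * operand slot, so the commutative operands are swapped to make room.
 */
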
2964/**
2965 * Must be called after calculate_live_intervals() to remove unused
2966 * writes to registers -- register allocation will fail otherwise
2967 * because something defined but never used won't be considered to
2968 * interfere with other regs.
2969 */
2970bool
2971fs_visitor::dead_code_eliminate()
2972{
2973   bool progress = false;
2974   int pc = 0;
2975
2976   calculate_live_intervals();
2977
2978   foreach_iter(exec_list_iterator, iter, this->instructions) {
2979      fs_inst *inst = (fs_inst *)iter.get();
2980
2981      if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
2982	 inst->remove();
2983	 progress = true;
2984      }
2985
2986      pc++;
2987   }
2988
2989   if (progress)
2990      live_intervals_valid = false;
2991
2992   return progress;
2993}
2994
2995bool
2996fs_visitor::register_coalesce()
2997{
2998   bool progress = false;
2999   int if_depth = 0;
3000   int loop_depth = 0;
3001
3002   foreach_iter(exec_list_iterator, iter, this->instructions) {
3003      fs_inst *inst = (fs_inst *)iter.get();
3004
3005      /* Make sure that we dominate the instructions we're going to
3006       * scan for interfering with our coalescing, or we won't have
3007       * scanned enough to see if anything interferes with our
3008       * coalescing.  We don't dominate the following instructions if
3009       * we're in a loop or an if block.
3010       */
3011      switch (inst->opcode) {
3012      case BRW_OPCODE_DO:
3013	 loop_depth++;
3014	 break;
3015      case BRW_OPCODE_WHILE:
3016	 loop_depth--;
3017	 break;
3018      case BRW_OPCODE_IF:
3019	 if_depth++;
3020	 break;
3021      case BRW_OPCODE_ENDIF:
3022	 if_depth--;
3023	 break;
3024      }
3025      if (loop_depth || if_depth)
3026	 continue;
3027
3028      if (inst->opcode != BRW_OPCODE_MOV ||
3029	  inst->predicated ||
3030	  inst->saturate ||
3031	  inst->dst.file != GRF || inst->src[0].file != GRF ||
3032	  inst->dst.type != inst->src[0].type)
3033	 continue;
3034
3035      bool has_source_modifiers = inst->src[0].abs || inst->src[0].negate;
3036
3037      /* Found a move of a GRF to a GRF.  Let's see if we can coalesce
3038       * them: check for no writes to either one until the exit of the
3039       * program.
3040       */
3041      bool interfered = false;
3042      exec_list_iterator scan_iter = iter;
3043      scan_iter.next();
3044      for (; scan_iter.has_next(); scan_iter.next()) {
3045	 fs_inst *scan_inst = (fs_inst *)scan_iter.get();
3046
3047	 if (scan_inst->dst.file == GRF) {
3048	    if (scan_inst->dst.reg == inst->dst.reg &&
3049		(scan_inst->dst.reg_offset == inst->dst.reg_offset ||
3050		 scan_inst->is_tex())) {
3051	       interfered = true;
3052	       break;
3053	    }
3054	    if (scan_inst->dst.reg == inst->src[0].reg &&
3055		(scan_inst->dst.reg_offset == inst->src[0].reg_offset ||
3056		 scan_inst->is_tex())) {
3057	       interfered = true;
3058	       break;
3059	    }
3060	 }
3061
3062	 /* The gen6 MATH instruction can't handle source modifiers, so avoid
3063	  * coalescing those for now.  We should do something more specific.
3064	  */
3065	 if (intel->gen == 6 && scan_inst->is_math() && has_source_modifiers) {
3066	    interfered = true;
3067	    break;
3068	 }
3069      }
3070      if (interfered) {
3071	 continue;
3072      }
3073
3074      /* Rewrite the later usage to point at the source of the move to
3075       * be removed.
3076       */
3077      for (exec_list_iterator scan_iter = iter; scan_iter.has_next();
3078	   scan_iter.next()) {
3079	 fs_inst *scan_inst = (fs_inst *)scan_iter.get();
3080
3081	 for (int i = 0; i < 3; i++) {
3082	    if (scan_inst->src[i].file == GRF &&
3083		scan_inst->src[i].reg == inst->dst.reg &&
3084		scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
3085	       scan_inst->src[i].reg = inst->src[0].reg;
3086	       scan_inst->src[i].reg_offset = inst->src[0].reg_offset;
3087	       scan_inst->src[i].abs |= inst->src[0].abs;
3088	       scan_inst->src[i].negate ^= inst->src[0].negate;
3089	       scan_inst->src[i].smear = inst->src[0].smear;
3090	    }
3091	 }
3092      }
3093
3094      inst->remove();
3095      progress = true;
3096   }
3097
3098   if (progress)
3099      live_intervals_valid = false;
3100
3101   return progress;
3102}
3103
3104
3105bool
3106fs_visitor::compute_to_mrf()
3107{
3108   bool progress = false;
3109   int next_ip = 0;
3110
3111   calculate_live_intervals();
3112
3113   foreach_iter(exec_list_iterator, iter, this->instructions) {
3114      fs_inst *inst = (fs_inst *)iter.get();
3115
3116      int ip = next_ip;
3117      next_ip++;
3118
3119      if (inst->opcode != BRW_OPCODE_MOV ||
3120	  inst->predicated ||
3121	  inst->dst.file != MRF || inst->src[0].file != GRF ||
3122	  inst->dst.type != inst->src[0].type ||
3123	  inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
3124	 continue;
3125
3126      /* Can't compute-to-MRF this GRF if someone else was going to
3127       * read it later.
3128       */
3129      if (this->virtual_grf_use[inst->src[0].reg] > ip)
3130	 continue;
3131
3132      /* Found a move of a GRF to a MRF.  Let's see if we can go
3133       * rewrite the thing that made this GRF to write into the MRF.
3134       */
3135      fs_inst *scan_inst;
3136      for (scan_inst = (fs_inst *)inst->prev;
3137	   scan_inst->prev != NULL;
3138	   scan_inst = (fs_inst *)scan_inst->prev) {
3139	 if (scan_inst->dst.file == GRF &&
3140	     scan_inst->dst.reg == inst->src[0].reg) {
3141	    /* Found the last write to the reg we want to turn
3142	     * into a compute-to-MRF.
3143	     */
3144
3145	    if (scan_inst->is_tex()) {
3146	       /* texturing writes several contiguous regs, so we can't
3147		* compute-to-mrf that.
3148		*/
3149	       break;
3150	    }
3151
3152	    /* If it's predicated, it (probably) didn't populate all
3153	     * the channels.
3154	     */
3155	    if (scan_inst->predicated)
3156	       break;
3157
3158	    /* SEND instructions can't have MRF as a destination. */
3159	    if (scan_inst->mlen)
3160	       break;
3161
3162	    if (intel->gen >= 6) {
3163	       /* gen6 math instructions must have the destination be
3164		* GRF, so no compute-to-MRF for them.
3165		*/
3166	       if (scan_inst->is_math()) {
3167		  break;
3168	       }
3169	    }
3170
3171	    if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
3172	       /* Found the creator of our MRF's source value. */
3173	       scan_inst->dst.file = MRF;
3174	       scan_inst->dst.hw_reg = inst->dst.hw_reg;
3175	       scan_inst->saturate |= inst->saturate;
3176	       inst->remove();
3177	       progress = true;
3178	    }
3179	    break;
3180	 }
3181
3182	 /* We don't handle flow control here.  Most computation of
3183	  * values that end up in MRFs are shortly before the MRF
3184	  * write anyway.
3185	  */
3186	 if (scan_inst->opcode == BRW_OPCODE_DO ||
3187	     scan_inst->opcode == BRW_OPCODE_WHILE ||
3188	     scan_inst->opcode == BRW_OPCODE_ELSE ||
3189	     scan_inst->opcode == BRW_OPCODE_ENDIF) {
3190	    break;
3191	 }
3192
3193	 /* You can't read from an MRF, so if someone else reads our
3194	  * MRF's source GRF that we wanted to rewrite, that stops us.
3195	  */
3196	 bool interfered = false;
3197	 for (int i = 0; i < 3; i++) {
3198	    if (scan_inst->src[i].file == GRF &&
3199		scan_inst->src[i].reg == inst->src[0].reg &&
3200		scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
3201	       interfered = true;
3202	    }
3203	 }
3204	 if (interfered)
3205	    break;
3206
3207	 if (scan_inst->dst.file == MRF &&
3208	     scan_inst->dst.hw_reg == inst->dst.hw_reg) {
3209	    /* Somebody else wrote our MRF here, so we can't
3210	     * compute-to-MRF before that.
3211	     */
3212	    break;
3213	 }
3214
3215	 if (scan_inst->mlen > 0) {
3216	    /* Found a SEND instruction, which means that there are
3217	     * live values in MRFs from base_mrf to base_mrf +
3218	     * scan_inst->mlen - 1.  Don't go pushing our MRF write up
3219	     * above it.
3220	     */
3221	    if (inst->dst.hw_reg >= scan_inst->base_mrf &&
3222		inst->dst.hw_reg < scan_inst->base_mrf + scan_inst->mlen) {
3223	       break;
3224	    }
3225	 }
3226      }
3227   }
3228
3229   return progress;
3230}
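
/* Example (editor's illustration):
 *
 *    ADD  g4, g5, g6
 *    MOV  m3, g4        (last read of g4)
 *
 * becomes a single ADD m3, g5, g6, provided nothing in between reads
 * g4, writes m3, or is a SEND whose message still occupies m3.
 */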
3231
3232/**
3233 * Walks through basic blocks, looking for repeated MRF writes and
3234 * removing the later ones.
3235 */
3236bool
3237fs_visitor::remove_duplicate_mrf_writes()
3238{
3239   fs_inst *last_mrf_move[16];
3240   bool progress = false;
3241
3242   memset(last_mrf_move, 0, sizeof(last_mrf_move));
3243
3244   foreach_iter(exec_list_iterator, iter, this->instructions) {
3245      fs_inst *inst = (fs_inst *)iter.get();
3246
3247      switch (inst->opcode) {
3248      case BRW_OPCODE_DO:
3249      case BRW_OPCODE_WHILE:
3250      case BRW_OPCODE_IF:
3251      case BRW_OPCODE_ELSE:
3252      case BRW_OPCODE_ENDIF:
3253	 memset(last_mrf_move, 0, sizeof(last_mrf_move));
3254	 continue;
3255      default:
3256	 break;
3257      }
3258
3259      if (inst->opcode == BRW_OPCODE_MOV &&
3260	  inst->dst.file == MRF) {
3261	 fs_inst *prev_inst = last_mrf_move[inst->dst.hw_reg];
3262	 if (prev_inst && inst->equals(prev_inst)) {
3263	    inst->remove();
3264	    progress = true;
3265	    continue;
3266	 }
3267      }
3268
3269      /* Clear out the last-write records for MRFs that were overwritten. */
3270      if (inst->dst.file == MRF) {
3271	 last_mrf_move[inst->dst.hw_reg] = NULL;
3272      }
3273
3274      if (inst->mlen > 0) {
3275	 /* Found a SEND instruction, which will include two or fewer
3276	  * implied MRF writes.  We could do better here.
3277	  */
3278	 for (int i = 0; i < implied_mrf_writes(inst); i++) {
3279	    last_mrf_move[inst->base_mrf + i] = NULL;
3280	 }
3281      }
3282
3283      /* Clear out any MRF move records whose sources got overwritten. */
3284      if (inst->dst.file == GRF) {
3285	 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
3286	    if (last_mrf_move[i] &&
3287		last_mrf_move[i]->src[0].reg == inst->dst.reg) {
3288	       last_mrf_move[i] = NULL;
3289	    }
3290	 }
3291      }
3292
3293      if (inst->opcode == BRW_OPCODE_MOV &&
3294	  inst->dst.file == MRF &&
3295	  inst->src[0].file == GRF &&
3296	  !inst->predicated) {
3297	 last_mrf_move[inst->dst.hw_reg] = inst;
3298      }
3299   }
3300
3301   return progress;
3302}
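
/* Example (editor's illustration): two back-to-back texture sends that
 * each set up the same coordinate with MOV m2, g4 will have the second
 * MOV removed, since m2 provably still holds g4's value.  Any
 * intervening write to g4 or m2, or any control flow, resets the
 * tracking instead.
 */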
3303
3304bool
3305fs_visitor::virtual_grf_interferes(int a, int b)
3306{
3307   int start = MAX2(this->virtual_grf_def[a], this->virtual_grf_def[b]);
3308   int end = MIN2(this->virtual_grf_use[a], this->virtual_grf_use[b]);
3309
3310   /* We can't handle dead register writes here, without iterating
3311    * over the whole instruction stream to find every single dead
3312    * write to that register to compare to the live interval of the
3313    * other register.  Just assert that dead_code_eliminate() has been
3314    * called.
3315    */
3316   assert((this->virtual_grf_use[a] != -1 ||
3317	   this->virtual_grf_def[a] == MAX_INSTRUCTION) &&
3318	  (this->virtual_grf_use[b] != -1 ||
3319	   this->virtual_grf_def[b] == MAX_INSTRUCTION));
3320
3321   return start < end;
3322}
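
/* Example (editor's illustration): for def/use intervals a = [10, 20]
 * and b = [15, 30], start = MAX2(10, 15) = 15 and end = MIN2(20, 30) =
 * 20, so the registers interfere.  For b = [20, 30] -- b defined by the
 * instruction that last reads a, e.g. a MOV -- start == end and there
 * is no interference, which is what allows coalescing such moves.
 */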
3323
3324static struct brw_reg brw_reg_from_fs_reg(fs_reg *reg)
3325{
3326   struct brw_reg brw_reg;
3327
3328   switch (reg->file) {
3329   case GRF:
3330   case ARF:
3331   case MRF:
3332      if (reg->smear == -1) {
3333	 brw_reg = brw_vec8_reg(reg->file,
3334				reg->hw_reg, 0);
3335      } else {
3336	 brw_reg = brw_vec1_reg(reg->file,
3337				reg->hw_reg, reg->smear);
3338      }
3339      brw_reg = retype(brw_reg, reg->type);
3340      break;
3341   case IMM:
3342      switch (reg->type) {
3343      case BRW_REGISTER_TYPE_F:
3344	 brw_reg = brw_imm_f(reg->imm.f);
3345	 break;
3346      case BRW_REGISTER_TYPE_D:
3347	 brw_reg = brw_imm_d(reg->imm.i);
3348	 break;
3349      case BRW_REGISTER_TYPE_UD:
3350	 brw_reg = brw_imm_ud(reg->imm.u);
3351	 break;
3352      default:
3353	 assert(!"not reached");
3354	 brw_reg = brw_null_reg();
3355	 break;
3356      }
3357      break;
3358   case FIXED_HW_REG:
3359      brw_reg = reg->fixed_hw_reg;
3360      break;
3361   case BAD_FILE:
3362      /* Probably unused. */
3363      brw_reg = brw_null_reg();
3364      break;
3365   case UNIFORM:
3366      assert(!"not reached");
3367      brw_reg = brw_null_reg();
3368      break;
3369   default:
3370      assert(!"not reached");
3371      brw_reg = brw_null_reg();
3372      break;
3373   }
3374   if (reg->abs)
3375      brw_reg = brw_abs(brw_reg);
3376   if (reg->negate)
3377      brw_reg = negate(brw_reg);
3378
3379   return brw_reg;
3380}
3381
3382void
3383fs_visitor::generate_code()
3384{
3385   int last_native_inst = 0;
3386   const char *last_annotation_string = NULL;
3387   ir_instruction *last_annotation_ir = NULL;
3388
3389   int if_stack_array_size = 16;
3390   int loop_stack_array_size = 16;
3391   int if_stack_depth = 0, loop_stack_depth = 0;
3392   brw_instruction **if_stack =
3393      rzalloc_array(this->mem_ctx, brw_instruction *, if_stack_array_size);
3394   brw_instruction **loop_stack =
3395      rzalloc_array(this->mem_ctx, brw_instruction *, loop_stack_array_size);
3396   int *if_depth_in_loop =
3397      rzalloc_array(this->mem_ctx, int, loop_stack_array_size);
3398
3399
3400   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3401      printf("Native code for fragment shader %d:\n",
3402	     ctx->Shader.CurrentFragmentProgram->Name);
3403   }
3404
3405   foreach_iter(exec_list_iterator, iter, this->instructions) {
3406      fs_inst *inst = (fs_inst *)iter.get();
3407      struct brw_reg src[3], dst;
3408
3409      if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3410	 if (last_annotation_ir != inst->ir) {
3411	    last_annotation_ir = inst->ir;
3412	    if (last_annotation_ir) {
3413	       printf("   ");
3414	       last_annotation_ir->print();
3415	       printf("\n");
3416	    }
3417	 }
3418	 if (last_annotation_string != inst->annotation) {
3419	    last_annotation_string = inst->annotation;
3420	    if (last_annotation_string)
3421	       printf("   %s\n", last_annotation_string);
3422	 }
3423      }
3424
3425      for (unsigned int i = 0; i < 3; i++) {
3426	 src[i] = brw_reg_from_fs_reg(&inst->src[i]);
3427      }
3428      dst = brw_reg_from_fs_reg(&inst->dst);
3429
3430      brw_set_conditionalmod(p, inst->conditional_mod);
3431      brw_set_predicate_control(p, inst->predicated);
3432      brw_set_saturate(p, inst->saturate);
3433
3434      switch (inst->opcode) {
3435      case BRW_OPCODE_MOV:
3436	 brw_MOV(p, dst, src[0]);
3437	 break;
3438      case BRW_OPCODE_ADD:
3439	 brw_ADD(p, dst, src[0], src[1]);
3440	 break;
3441      case BRW_OPCODE_MUL:
3442	 brw_MUL(p, dst, src[0], src[1]);
3443	 break;
3444
3445      case BRW_OPCODE_FRC:
3446	 brw_FRC(p, dst, src[0]);
3447	 break;
3448      case BRW_OPCODE_RNDD:
3449	 brw_RNDD(p, dst, src[0]);
3450	 break;
3451      case BRW_OPCODE_RNDE:
3452	 brw_RNDE(p, dst, src[0]);
3453	 break;
3454      case BRW_OPCODE_RNDZ:
3455	 brw_RNDZ(p, dst, src[0]);
3456	 break;
3457
3458      case BRW_OPCODE_AND:
3459	 brw_AND(p, dst, src[0], src[1]);
3460	 break;
3461      case BRW_OPCODE_OR:
3462	 brw_OR(p, dst, src[0], src[1]);
3463	 break;
3464      case BRW_OPCODE_XOR:
3465	 brw_XOR(p, dst, src[0], src[1]);
3466	 break;
3467      case BRW_OPCODE_NOT:
3468	 brw_NOT(p, dst, src[0]);
3469	 break;
3470      case BRW_OPCODE_ASR:
3471	 brw_ASR(p, dst, src[0], src[1]);
3472	 break;
3473      case BRW_OPCODE_SHR:
3474	 brw_SHR(p, dst, src[0], src[1]);
3475	 break;
3476      case BRW_OPCODE_SHL:
3477	 brw_SHL(p, dst, src[0], src[1]);
3478	 break;
3479
3480      case BRW_OPCODE_CMP:
3481	 brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
3482	 break;
3483      case BRW_OPCODE_SEL:
3484	 brw_SEL(p, dst, src[0], src[1]);
3485	 break;
3486
3487      case BRW_OPCODE_IF:
3488	 if (inst->src[0].file != BAD_FILE) {
3489	    assert(intel->gen >= 6);
3490	    if_stack[if_stack_depth] = gen6_IF(p, inst->conditional_mod, src[0], src[1]);
3491	 } else {
3492	    if_stack[if_stack_depth] = brw_IF(p, BRW_EXECUTE_8);
3493	 }
3494	 if_depth_in_loop[loop_stack_depth]++;
3495	 if_stack_depth++;
3496	 if (if_stack_array_size <= if_stack_depth) {
3497	    if_stack_array_size *= 2;
3498	    if_stack = reralloc(this->mem_ctx, if_stack, brw_instruction *,
3499			        if_stack_array_size);
3500	 }
3501	 break;
3502
3503      case BRW_OPCODE_ELSE:
3504	 if_stack[if_stack_depth - 1] =
3505	    brw_ELSE(p, if_stack[if_stack_depth - 1]);
3506	 break;
3507      case BRW_OPCODE_ENDIF:
3508	 if_stack_depth--;
3509	 brw_ENDIF(p , if_stack[if_stack_depth]);
3510	 if_depth_in_loop[loop_stack_depth]--;
3511	 break;
3512
3513      case BRW_OPCODE_DO:
3514	 loop_stack[loop_stack_depth++] = brw_DO(p, BRW_EXECUTE_8);
3515	 if (loop_stack_array_size <= loop_stack_depth) {
3516	    loop_stack_array_size *= 2;
3517	    loop_stack = reralloc(this->mem_ctx, loop_stack, brw_instruction *,
3518				  loop_stack_array_size);
3519	    if_depth_in_loop = reralloc(this->mem_ctx, if_depth_in_loop, int,
3520				        loop_stack_array_size);
3521	 }
3522	 if_depth_in_loop[loop_stack_depth] = 0;
3523	 break;
3524
3525      case BRW_OPCODE_BREAK:
3526	 brw_BREAK(p, if_depth_in_loop[loop_stack_depth]);
3527	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
3528	 break;
3529      case BRW_OPCODE_CONTINUE:
3530	 /* FINISHME: We need to write the loop instruction support still. */
3531	 if (intel->gen >= 6)
3532	    gen6_CONT(p, loop_stack[loop_stack_depth - 1]);
3533	 else
3534	    brw_CONT(p, if_depth_in_loop[loop_stack_depth]);
3535	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
3536	 break;
3537
3538      case BRW_OPCODE_WHILE: {
3539	 struct brw_instruction *inst0, *inst1;
3540	 GLuint br = 1;
3541
3542	 if (intel->gen >= 5)
3543	    br = 2;
3544
3545	 assert(loop_stack_depth > 0);
3546	 loop_stack_depth--;
3547	 inst0 = inst1 = brw_WHILE(p, loop_stack[loop_stack_depth]);
3548	 if (intel->gen < 6) {
3549	    /* patch all the BREAK/CONT instructions from last BGNLOOP */
3550	    while (inst0 > loop_stack[loop_stack_depth]) {
3551	       inst0--;
3552	       if (inst0->header.opcode == BRW_OPCODE_BREAK &&
3553		   inst0->bits3.if_else.jump_count == 0) {
3554		  inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
3555	       }
3556	       else if (inst0->header.opcode == BRW_OPCODE_CONTINUE &&
3557			inst0->bits3.if_else.jump_count == 0) {
3558		  inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
3559	       }
3560	    }
3561	 }
3562      }
3563	 break;
3564
3565      case FS_OPCODE_RCP:
3566      case FS_OPCODE_RSQ:
3567      case FS_OPCODE_SQRT:
3568      case FS_OPCODE_EXP2:
3569      case FS_OPCODE_LOG2:
3570      case FS_OPCODE_POW:
3571      case FS_OPCODE_SIN:
3572      case FS_OPCODE_COS:
3573	 generate_math(inst, dst, src);
3574	 break;
3575      case FS_OPCODE_CINTERP:
3576	 brw_MOV(p, dst, src[0]);
3577	 break;
3578      case FS_OPCODE_LINTERP:
3579	 generate_linterp(inst, dst, src);
3580	 break;
3581      case FS_OPCODE_TEX:
3582      case FS_OPCODE_TXB:
3583      case FS_OPCODE_TXD:
3584      case FS_OPCODE_TXL:
3585	 generate_tex(inst, dst, src[0]);
3586	 break;
3587      case FS_OPCODE_DISCARD_NOT:
3588	 generate_discard_not(inst, dst);
3589	 break;
3590      case FS_OPCODE_DISCARD_AND:
3591	 generate_discard_and(inst, src[0]);
3592	 break;
3593      case FS_OPCODE_DDX:
3594	 generate_ddx(inst, dst, src[0]);
3595	 break;
3596      case FS_OPCODE_DDY:
3597	 generate_ddy(inst, dst, src[0]);
3598	 break;
3599
3600      case FS_OPCODE_SPILL:
3601	 generate_spill(inst, src[0]);
3602	 break;
3603
3604      case FS_OPCODE_UNSPILL:
3605	 generate_unspill(inst, dst);
3606	 break;
3607
3608      case FS_OPCODE_PULL_CONSTANT_LOAD:
3609	 generate_pull_constant_load(inst, dst);
3610	 break;
3611
3612      case FS_OPCODE_FB_WRITE:
3613	 generate_fb_write(inst);
3614	 break;
3615      default:
3616	 if (inst->opcode < (int)ARRAY_SIZE(brw_opcodes)) {
3617	    _mesa_problem(ctx, "Unsupported opcode `%s' in FS",
3618			  brw_opcodes[inst->opcode].name);
3619	 } else {
3620	    _mesa_problem(ctx, "Unsupported opcode %d in FS", inst->opcode);
3621	 }
3622	 fail("unsupported opcode in FS\n");
3623      }
3624
3625      if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3626	 for (unsigned int i = last_native_inst; i < p->nr_insn; i++) {
3627	    if (0) {
3628	       printf("0x%08x 0x%08x 0x%08x 0x%08x ",
3629		      ((uint32_t *)&p->store[i])[3],
3630		      ((uint32_t *)&p->store[i])[2],
3631		      ((uint32_t *)&p->store[i])[1],
3632		      ((uint32_t *)&p->store[i])[0]);
3633	    }
3634	    brw_disasm(stdout, &p->store[i], intel->gen);
3635	 }
3636      }
3637
3638      last_native_inst = p->nr_insn;
3639   }
3640
3641   ralloc_free(if_stack);
3642   ralloc_free(loop_stack);
3643   ralloc_free(if_depth_in_loop);
3644
3645   brw_set_uip_jip(p);
3646
3647   /* OK, while the INTEL_DEBUG=wm above is very nice for debugging FS
3648    * emit issues, it doesn't get the jump distances into the output,
3649    * which is often something we want to debug.  So this is here in
3650    * case you're doing that.
3651    */
3652   if (0) {
3653      if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3654	 for (unsigned int i = 0; i < p->nr_insn; i++) {
3655	    printf("0x%08x 0x%08x 0x%08x 0x%08x ",
3656		   ((uint32_t *)&p->store[i])[3],
3657		   ((uint32_t *)&p->store[i])[2],
3658		   ((uint32_t *)&p->store[i])[1],
3659		   ((uint32_t *)&p->store[i])[0]);
3660	    brw_disasm(stdout, &p->store[i], intel->gen);
3661	 }
3662      }
3663   }
3664}
3665
3666GLboolean
3667brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c)
3668{
3669   struct intel_context *intel = &brw->intel;
3670   struct gl_context *ctx = &intel->ctx;
3671   struct gl_shader_program *prog = ctx->Shader.CurrentFragmentProgram;
3672
3673   if (!prog)
3674      return GL_FALSE;
3675
3676   struct brw_shader *shader =
3677     (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3678   if (!shader)
3679      return GL_FALSE;
3680
3681   /* We always use 8-wide mode, at least for now.  For one, flow
3682    * control only works in 8-wide.  Also, when we're fragment shader
3683    * bound, we're almost always under register pressure as well, so
3684    * 8-wide would save us from the performance cliff of spilling
3685    * regs.
3686    */
3687   c->dispatch_width = 8;
3688
3689   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3690      printf("GLSL IR for native fragment shader %d:\n", prog->Name);
3691      _mesa_print_ir(shader->ir, NULL);
3692      printf("\n");
3693   }
3694
3695   /* Now the main event: Visit the shader IR and generate our FS IR for it.
3696    */
3697   fs_visitor v(c, shader);
3698
3699   if (0) {
3700      v.emit_dummy_fs();
3701   } else {
3702      v.calculate_urb_setup();
3703      if (intel->gen < 6)
3704	 v.emit_interpolation_setup_gen4();
3705      else
3706	 v.emit_interpolation_setup_gen6();
3707
3708      /* Generate FS IR for main().  (The visitor only descends into
3709       * functions called "main".)
3710       */
3711      foreach_iter(exec_list_iterator, iter, *shader->ir) {
3712	 ir_instruction *ir = (ir_instruction *)iter.get();
3713	 v.base_ir = ir;
3714	 ir->accept(&v);
3715      }
3716
3717      v.emit_fb_writes();
3718
3719      v.split_virtual_grfs();
3720
3721      v.setup_paramvalues_refs();
3722      v.setup_pull_constants();
3723
3724      bool progress;
3725      do {
3726	 progress = false;
3727
3728	 progress = v.remove_duplicate_mrf_writes() || progress;
3729
3730	 progress = v.propagate_constants() || progress;
3731	 progress = v.register_coalesce() || progress;
3732	 progress = v.compute_to_mrf() || progress;
3733	 progress = v.dead_code_eliminate() || progress;
3734      } while (progress);
3735
3736      v.schedule_instructions();
3737
3738      v.assign_curb_setup();
3739      v.assign_urb_setup();
3740
3741      if (0) {
3742	 /* Debug of register spilling: Go spill everything. */
3743	 int virtual_grf_count = v.virtual_grf_next;
3744	 for (int i = 1; i < virtual_grf_count; i++) {
3745	    v.spill_reg(i);
3746	 }
3747      }
3748
3749      if (0)
3750	 v.assign_regs_trivial();
3751      else {
3752	 while (!v.assign_regs()) {
3753	    if (v.failed)
3754	       break;
3755	 }
3756      }
3757   }
3758
3759   if (!v.failed)
3760      v.generate_code();
3761
3762   assert(!v.failed); /* FINISHME: Cleanly fail, tested at link time, etc. */
3763
3764   if (v.failed)
3765      return GL_FALSE;
3766
3767   c->prog_data.total_grf = v.grf_used;
3768
3769   return GL_TRUE;
3770}
3771