brw_fs.cpp revision 252eaa765e69a70036ec33a7e1e0ffeac1aab2ff
1/*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * Authors:
24 *    Eric Anholt <eric@anholt.net>
25 *
26 */
27
28extern "C" {
29
30#include <sys/types.h>
31
32#include "main/macros.h"
33#include "main/shaderobj.h"
34#include "main/uniforms.h"
35#include "program/prog_parameter.h"
36#include "program/prog_print.h"
37#include "program/prog_optimize.h"
38#include "program/register_allocate.h"
39#include "program/sampler.h"
40#include "program/hash_table.h"
41#include "brw_context.h"
42#include "brw_eu.h"
43#include "brw_wm.h"
44}
45#include "brw_fs.h"
46#include "../glsl/glsl_types.h"
47#include "../glsl/ir_optimization.h"
48#include "../glsl/ir_print_visitor.h"
49
50#define MAX_INSTRUCTION (1 << 30)
51static struct brw_reg brw_reg_from_fs_reg(class fs_reg *reg);
52
53struct gl_shader *
54brw_new_shader(struct gl_context *ctx, GLuint name, GLuint type)
55{
56   struct brw_shader *shader;
57
58   shader = rzalloc(NULL, struct brw_shader);
59   if (shader) {
60      shader->base.Type = type;
61      shader->base.Name = name;
62      _mesa_init_shader(ctx, &shader->base);
63   }
64
65   return &shader->base;
66}
67
68struct gl_shader_program *
69brw_new_shader_program(struct gl_context *ctx, GLuint name)
70{
71   struct brw_shader_program *prog;
72   prog = rzalloc(NULL, struct brw_shader_program);
73   if (prog) {
74      prog->base.Name = name;
75      _mesa_init_shader_program(ctx, &prog->base);
76   }
77   return &prog->base;
78}
79
/**
 * Links the program and, for the fragment stage, runs the GLSL IR
 * lowering passes this backend needs before code generation.
 *
 * The linked fragment IR is cloned (the passes here are destructive,
 * and other consumers still need prog's linked IR), lowered, then
 * optimized to a fixed point.  Returns GL_FALSE if the fallback
 * Mesa-IR link fails.
 */
GLboolean
brw_link_shader(struct gl_context *ctx, struct gl_shader_program *prog)
{
   struct brw_context *brw = brw_context(ctx);
   struct intel_context *intel = &brw->intel;

   struct brw_shader *shader =
      (struct brw_shader *)prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
   if (shader != NULL) {
      void *mem_ctx = ralloc_context(NULL);
      bool progress;

      /* Throw away any IR left over from a previous link. */
      if (shader->ir)
	 ralloc_free(shader->ir);
      shader->ir = new(shader) exec_list;
      clone_ir_list(mem_ctx, shader->ir, shader->base.ir);

      /* Lower matrix ops and math forms the backend doesn't handle
       * directly.
       */
      do_mat_op_to_vec(shader->ir);
      lower_instructions(shader->ir,
			 MOD_TO_FRACT |
			 DIV_TO_MUL_RCP |
			 SUB_TO_ADD_NEG |
			 EXP_TO_EXP2 |
			 LOG_TO_LOG2);

      /* Pre-gen6 HW can only nest if-statements 16 deep.  Beyond this,
       * if-statements need to be flattened.
       */
      if (intel->gen < 6)
	 lower_if_to_cond_assign(shader->ir, 16);

      do_lower_texture_projection(shader->ir);
      do_vec_index_to_cond_assign(shader->ir);
      brw_do_cubemap_normalize(shader->ir);
      lower_noise(shader->ir);
      lower_quadop_vector(shader->ir, false);
      /* Variable indexing is turned into conditional assignments for
       * every storage class.
       */
      lower_variable_index_to_cond_assign(shader->ir,
					  GL_TRUE, /* input */
					  GL_TRUE, /* output */
					  GL_TRUE, /* temp */
					  GL_TRUE /* uniform */
					  );

      /* Scalarize and optimize repeatedly until nothing changes. */
      do {
	 progress = false;

	 brw_do_channel_expressions(shader->ir);
	 brw_do_vector_splitting(shader->ir);

	 progress = do_lower_jumps(shader->ir, true, true,
				   true, /* main return */
				   false, /* continue */
				   false /* loops */
				   ) || progress;

	 progress = do_common_optimization(shader->ir, true, 32) || progress;
      } while (progress);

      validate_ir_tree(shader->ir);

      /* Reparent the surviving IR onto the shader before freeing the
       * temporary context everything was cloned into.
       */
      reparent_ir(shader->ir, shader->ir);
      ralloc_free(mem_ctx);
   }

   if (!_mesa_ir_link_shader(ctx, prog))
      return GL_FALSE;

   return GL_TRUE;
}
149
150static int
151type_size(const struct glsl_type *type)
152{
153   unsigned int size, i;
154
155   switch (type->base_type) {
156   case GLSL_TYPE_UINT:
157   case GLSL_TYPE_INT:
158   case GLSL_TYPE_FLOAT:
159   case GLSL_TYPE_BOOL:
160      return type->components();
161   case GLSL_TYPE_ARRAY:
162      return type_size(type->fields.array) * type->length;
163   case GLSL_TYPE_STRUCT:
164      size = 0;
165      for (i = 0; i < type->length; i++) {
166	 size += type_size(type->fields.structure[i].type);
167      }
168      return size;
169   case GLSL_TYPE_SAMPLER:
170      /* Samplers take up no register space, since they're baked in at
171       * link time.
172       */
173      return 0;
174   default:
175      assert(!"not reached");
176      return 0;
177   }
178}
179
180void
181fs_visitor::fail(const char *format, ...)
182{
183   if (!failed) {
184      failed = true;
185
186      if (INTEL_DEBUG & DEBUG_WM) {
187	 fprintf(stderr, "FS compile failed: ");
188
189	 va_list va;
190	 va_start(va, format);
191	 vfprintf(stderr, format, va);
192	 va_end(va);
193      }
194   }
195}
196
197/**
198 * Returns how many MRFs an FS opcode will write over.
199 *
200 * Note that this is not the 0 or 1 implied writes in an actual gen
201 * instruction -- the FS opcodes often generate MOVs in addition.
202 */
203int
204fs_visitor::implied_mrf_writes(fs_inst *inst)
205{
206   if (inst->mlen == 0)
207      return 0;
208
209   switch (inst->opcode) {
210   case FS_OPCODE_RCP:
211   case FS_OPCODE_RSQ:
212   case FS_OPCODE_SQRT:
213   case FS_OPCODE_EXP2:
214   case FS_OPCODE_LOG2:
215   case FS_OPCODE_SIN:
216   case FS_OPCODE_COS:
217      return 1;
218   case FS_OPCODE_POW:
219      return 2;
220   case FS_OPCODE_TEX:
221   case FS_OPCODE_TXB:
222   case FS_OPCODE_TXD:
223   case FS_OPCODE_TXL:
224      return 1;
225   case FS_OPCODE_FB_WRITE:
226      return 2;
227   case FS_OPCODE_PULL_CONSTANT_LOAD:
228   case FS_OPCODE_UNSPILL:
229      return 1;
230   case FS_OPCODE_SPILL:
231      return 2;
232   default:
233      assert(!"not reached");
234      return inst->mlen;
235   }
236}
237
238int
239fs_visitor::virtual_grf_alloc(int size)
240{
241   if (virtual_grf_array_size <= virtual_grf_next) {
242      if (virtual_grf_array_size == 0)
243	 virtual_grf_array_size = 16;
244      else
245	 virtual_grf_array_size *= 2;
246      virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
247				   virtual_grf_array_size);
248
249      /* This slot is always unused. */
250      virtual_grf_sizes[0] = 0;
251   }
252   virtual_grf_sizes[virtual_grf_next] = size;
253   return virtual_grf_next++;
254}
255
256/** Fixed HW reg constructor. */
257fs_reg::fs_reg(enum register_file file, int hw_reg)
258{
259   init();
260   this->file = file;
261   this->hw_reg = hw_reg;
262   this->type = BRW_REGISTER_TYPE_F;
263}
264
265/** Fixed HW reg constructor. */
266fs_reg::fs_reg(enum register_file file, int hw_reg, uint32_t type)
267{
268   init();
269   this->file = file;
270   this->hw_reg = hw_reg;
271   this->type = type;
272}
273
274int
275brw_type_for_base_type(const struct glsl_type *type)
276{
277   switch (type->base_type) {
278   case GLSL_TYPE_FLOAT:
279      return BRW_REGISTER_TYPE_F;
280   case GLSL_TYPE_INT:
281   case GLSL_TYPE_BOOL:
282      return BRW_REGISTER_TYPE_D;
283   case GLSL_TYPE_UINT:
284      return BRW_REGISTER_TYPE_UD;
285   case GLSL_TYPE_ARRAY:
286   case GLSL_TYPE_STRUCT:
287   case GLSL_TYPE_SAMPLER:
288      /* These should be overridden with the type of the member when
289       * dereferenced into.  BRW_REGISTER_TYPE_UD seems like a likely
290       * way to trip up if we don't.
291       */
292      return BRW_REGISTER_TYPE_UD;
293   default:
294      assert(!"not reached");
295      return BRW_REGISTER_TYPE_F;
296   }
297}
298
299/** Automatic reg constructor. */
300fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
301{
302   init();
303
304   this->file = GRF;
305   this->reg = v->virtual_grf_alloc(type_size(type));
306   this->reg_offset = 0;
307   this->type = brw_type_for_base_type(type);
308}
309
310fs_reg *
311fs_visitor::variable_storage(ir_variable *var)
312{
313   return (fs_reg *)hash_table_find(this->variable_ht, var);
314}
315
/* Our support for uniforms is piggy-backed on the struct
 * gl_fragment_program, because that's where the values actually
 * get stored, rather than in some global gl_shader_program uniform
 * store.
 */
/**
 * Recursively registers one param slot per scalar component of a
 * uniform, recording its source location/component and the conversion
 * needed from float storage.
 *
 * \param loc  base uniform location of this (sub-)type
 * \return the number of uniform locations consumed
 */
int
fs_visitor::setup_uniform_values(int loc, const glsl_type *type)
{
   unsigned int offset = 0;

   /* A matrix is treated as matrix_columns consecutive column vectors. */
   if (type->is_matrix()) {
      const glsl_type *column = glsl_type::get_instance(GLSL_TYPE_FLOAT,
							type->vector_elements,
							1);

      for (unsigned int i = 0; i < type->matrix_columns; i++) {
	 offset += setup_uniform_values(loc + offset, column);
      }

      return offset;
   }

   switch (type->base_type) {
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_BOOL:
      /* Scalar/vector: one param per component, all sharing location
       * "loc" and distinguished by offset i.
       */
      for (unsigned int i = 0; i < type->vector_elements; i++) {
	 unsigned int param = c->prog_data.nr_params++;

	 assert(param < ARRAY_SIZE(c->prog_data.param));

	 /* Stored values are floats; note the conversion that produces
	  * the shader-visible representation.
	  */
	 switch (type->base_type) {
	 case GLSL_TYPE_FLOAT:
	    c->prog_data.param_convert[param] = PARAM_NO_CONVERT;
	    break;
	 case GLSL_TYPE_UINT:
	    c->prog_data.param_convert[param] = PARAM_CONVERT_F2U;
	    break;
	 case GLSL_TYPE_INT:
	    c->prog_data.param_convert[param] = PARAM_CONVERT_F2I;
	    break;
	 case GLSL_TYPE_BOOL:
	    c->prog_data.param_convert[param] = PARAM_CONVERT_F2B;
	    break;
	 default:
	    assert(!"not reached");
	    c->prog_data.param_convert[param] = PARAM_NO_CONVERT;
	    break;
	 }
	 this->param_index[param] = loc;
	 this->param_offset[param] = i;
      }
      return 1;

   case GLSL_TYPE_STRUCT:
      for (unsigned int i = 0; i < type->length; i++) {
	 offset += setup_uniform_values(loc + offset,
					type->fields.structure[i].type);
      }
      return offset;

   case GLSL_TYPE_ARRAY:
      for (unsigned int i = 0; i < type->length; i++) {
	 offset += setup_uniform_values(loc + offset, type->fields.array);
      }
      return offset;

   case GLSL_TYPE_SAMPLER:
      /* The sampler takes up a slot, but we don't use any values from it. */
      return 1;

   default:
      assert(!"not reached");
      return 0;
   }
}
393
394
/* Our support for builtin uniforms is even scarier than non-builtin.
 * It sits on top of the PROG_STATE_VAR parameters that are
 * automatically updated from GL context state.
 */
/**
 * Registers param slots for a gl_* builtin uniform by looking up its
 * description table entry and adding state references for each unique
 * swizzle component of each element.
 */
void
fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
{
   const struct gl_builtin_uniform_desc *statevar = NULL;

   /* Find the descriptor table entry matching this builtin's name.
    * The table is terminated by an entry with a NULL name, which is
    * what statevar points at if the loop runs off the end.
    */
   for (unsigned int i = 0; _mesa_builtin_uniform_desc[i].name; i++) {
      statevar = &_mesa_builtin_uniform_desc[i];
      if (strcmp(ir->name, _mesa_builtin_uniform_desc[i].name) == 0)
	 break;
   }

   if (!statevar->name) {
      fail("Failed to find builtin uniform `%s'\n", ir->name);
      return;
   }

   int array_count;
   if (ir->type->is_array()) {
      array_count = ir->type->length;
   } else {
      array_count = 1;
   }

   for (int a = 0; a < array_count; a++) {
      for (unsigned int i = 0; i < statevar->num_elements; i++) {
	 struct gl_builtin_uniform_element *element = &statevar->elements[i];
	 int tokens[STATE_LENGTH];

	 memcpy(tokens, element->tokens, sizeof(element->tokens));
	 /* For arrays, token slot 1 selects the array index. */
	 if (ir->type->is_array()) {
	    tokens[1] = a;
	 }

	 /* This state reference has already been setup by ir_to_mesa,
	  * but we'll get the same index back here.
	  */
	 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
					       (gl_state_index *)tokens);

	 /* Add each of the unique swizzles of the element as a
	  * parameter.  This'll end up matching the expected layout of
	  * the array/matrix/structure we're trying to fill in.
	  */
	 int last_swiz = -1;
	 for (unsigned int j = 0; j < 4; j++) {
	    int swiz = GET_SWZ(element->swizzle, j);
	    /* A repeated swizzle component marks the end of the unique
	     * components.
	     */
	    if (swiz == last_swiz)
	       break;
	    last_swiz = swiz;

	    c->prog_data.param_convert[c->prog_data.nr_params] =
	       PARAM_NO_CONVERT;
	    this->param_index[c->prog_data.nr_params] = index;
	    this->param_offset[c->prog_data.nr_params] = swiz;
	    c->prog_data.nr_params++;
	 }
      }
   }
}
458
459fs_reg *
460fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
461{
462   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
463   fs_reg wpos = *reg;
464   fs_reg neg_y = this->pixel_y;
465   neg_y.negate = true;
466   bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;
467
468   /* gl_FragCoord.x */
469   if (ir->pixel_center_integer) {
470      emit(BRW_OPCODE_MOV, wpos, this->pixel_x);
471   } else {
472      emit(BRW_OPCODE_ADD, wpos, this->pixel_x, fs_reg(0.5f));
473   }
474   wpos.reg_offset++;
475
476   /* gl_FragCoord.y */
477   if (!flip && ir->pixel_center_integer) {
478      emit(BRW_OPCODE_MOV, wpos, this->pixel_y);
479   } else {
480      fs_reg pixel_y = this->pixel_y;
481      float offset = (ir->pixel_center_integer ? 0.0 : 0.5);
482
483      if (flip) {
484	 pixel_y.negate = true;
485	 offset += c->key.drawable_height - 1.0;
486      }
487
488      emit(BRW_OPCODE_ADD, wpos, pixel_y, fs_reg(offset));
489   }
490   wpos.reg_offset++;
491
492   /* gl_FragCoord.z */
493   if (intel->gen >= 6) {
494      emit(BRW_OPCODE_MOV, wpos,
495	   fs_reg(brw_vec8_grf(c->source_depth_reg, 0)));
496   } else {
497      emit(FS_OPCODE_LINTERP, wpos, this->delta_x, this->delta_y,
498	   interp_reg(FRAG_ATTRIB_WPOS, 2));
499   }
500   wpos.reg_offset++;
501
502   /* gl_FragCoord.w: Already set up in emit_interpolation */
503   emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
504
505   return reg;
506}
507
/**
 * Emits interpolation for an ordinary varying input, walking array
 * elements and matrix columns one attribute slot at a time and
 * advancing attr.reg_offset past every component written.
 */
fs_reg *
fs_visitor::emit_general_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   /* Interpolation is always in floating point regs. */
   reg->type = BRW_REGISTER_TYPE_F;
   fs_reg attr = *reg;

   unsigned int array_elements;
   const glsl_type *type;

   if (ir->type->is_array()) {
      array_elements = ir->type->length;
      if (array_elements == 0) {
	 fail("dereferenced array '%s' has length 0\n", ir->name);
      }
      type = ir->type->fields.array;
   } else {
      array_elements = 1;
      type = ir->type;
   }

   int location = ir->location;
   for (unsigned int i = 0; i < array_elements; i++) {
      for (unsigned int j = 0; j < type->matrix_columns; j++) {
	 if (urb_setup[location] == -1) {
	    /* If there's no incoming setup data for this slot, don't
	     * emit interpolation for it.
	     */
	    attr.reg_offset += type->vector_elements;
	    location++;
	    continue;
	 }

	 bool is_gl_Color =
	    location == FRAG_ATTRIB_COL0 || location == FRAG_ATTRIB_COL1;

	 if (c->key.flat_shade && is_gl_Color) {
	    /* Constant interpolation (flat shading) case. The SF has
	     * handed us defined values in only the constant offset
	     * field of the setup reg.
	     */
	    for (unsigned int k = 0; k < type->vector_elements; k++) {
	       struct brw_reg interp = interp_reg(location, k);
	       interp = suboffset(interp, 3);
	       emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
	       attr.reg_offset++;
	    }
	 } else {
	    /* Perspective interpolation case. */
	    for (unsigned int k = 0; k < type->vector_elements; k++) {
	       struct brw_reg interp = interp_reg(location, k);
	       emit(FS_OPCODE_LINTERP, attr,
		    this->delta_x, this->delta_y, fs_reg(interp));
	       attr.reg_offset++;
	    }

	    /* Pre-gen6, the interpolated value needs an explicit
	     * multiply by pixel w; rewind over the components just
	     * written and scale each in place.
	     */
	    if (intel->gen < 6 && !(is_gl_Color && c->key.linear_color)) {
	       attr.reg_offset -= type->vector_elements;
	       for (unsigned int k = 0; k < type->vector_elements; k++) {
		  emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
		  attr.reg_offset++;
	       }
	    }
	 }
	 location++;
      }
   }

   return reg;
}
579
/**
 * Emits the instructions that compute gl_FrontFacing (0 or 1) from the
 * facing bit the hardware delivers in the thread payload.
 */
fs_reg *
fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);

   /* The frontfacing comes in as a bit in the thread payload. */
   if (intel->gen >= 6) {
      /* Arithmetic shift replicates the sign bit; NOT + AND then
       * reduce it to a 0/1 front-facing value.
       */
      emit(BRW_OPCODE_ASR, *reg,
	   fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
	   fs_reg(15));
      emit(BRW_OPCODE_NOT, *reg, *reg);
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
   } else {
      struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
      /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
       * us front face
       */
      fs_inst *inst = emit(BRW_OPCODE_CMP, *reg,
			   fs_reg(r1_6ud),
			   fs_reg(1u << 31));
      inst->conditional_mod = BRW_CONDITIONAL_L;
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
   }

   return reg;
}
606
/**
 * Emits a single-source math instruction, handling the per-generation
 * quirks (gen6+ operand restrictions, pre-gen6 message-based math).
 *
 * Returns the emitted instruction, or NULL for a non-math opcode.
 */
fs_inst *
fs_visitor::emit_math(fs_opcodes opcode, fs_reg dst, fs_reg src)
{
   switch (opcode) {
   case FS_OPCODE_RCP:
   case FS_OPCODE_RSQ:
   case FS_OPCODE_SQRT:
   case FS_OPCODE_EXP2:
   case FS_OPCODE_LOG2:
   case FS_OPCODE_SIN:
   case FS_OPCODE_COS:
      break;
   default:
      assert(!"not reached: bad math opcode");
      return NULL;
   }

   /* Can't do hstride == 0 args to gen6 math, so expand it out.  We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * The hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (intel->gen >= 6 && (src.file == UNIFORM ||
			   src.abs ||
			   src.negate)) {
      fs_reg expanded = fs_reg(this, glsl_type::float_type);
      emit(BRW_OPCODE_MOV, expanded, src);
      src = expanded;
   }

   fs_inst *inst = emit(opcode, dst, src);

   /* Pre-gen6 math is a send; the argument goes through an MRF. */
   if (intel->gen < 6) {
      inst->base_mrf = 2;
      inst->mlen = 1;
   }

   return inst;
}
649
/**
 * Emits a two-source math instruction (currently only POW), handling
 * gen6 operand restrictions and the pre-gen6 MRF-based message form.
 */
fs_inst *
fs_visitor::emit_math(fs_opcodes opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   int base_mrf = 2;
   fs_inst *inst;

   assert(opcode == FS_OPCODE_POW);

   if (intel->gen >= 6) {
      /* Can't do hstride == 0 args to gen6 math, so expand it out.
       *
       * The hardware ignores source modifiers (negate and abs) on math
       * instructions, so we also move to a temp to set those up.
       */
      if (src0.file == UNIFORM || src0.abs || src0.negate) {
	 fs_reg expanded = fs_reg(this, glsl_type::float_type);
	 emit(BRW_OPCODE_MOV, expanded, src0);
	 src0 = expanded;
      }

      if (src1.file == UNIFORM || src1.abs || src1.negate) {
	 fs_reg expanded = fs_reg(this, glsl_type::float_type);
	 emit(BRW_OPCODE_MOV, expanded, src1);
	 src1 = expanded;
      }

      inst = emit(opcode, dst, src0, src1);
   } else {
      /* Pre-gen6: the second operand is delivered in the message
       * payload (MRF base_mrf + 1), not as an instruction source.
       */
      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1), src1);
      inst = emit(opcode, dst, src0, reg_null_f);

      inst->base_mrf = base_mrf;
      inst->mlen = 2;
   }
   return inst;
}
686
/**
 * Sets up storage for a variable the first time it is seen: inputs get
 * interpolation code emitted, uniforms get param slots, everything
 * else gets a fresh virtual GRF.  Also notes the special output
 * variables (gl_FragColor/gl_FragData/gl_FragDepth) for later use.
 */
void
fs_visitor::visit(ir_variable *ir)
{
   fs_reg *reg = NULL;

   /* Already handled on a previous visit. */
   if (variable_storage(ir))
      return;

   if (strcmp(ir->name, "gl_FragColor") == 0) {
      this->frag_color = ir;
   } else if (strcmp(ir->name, "gl_FragData") == 0) {
      this->frag_data = ir;
   } else if (strcmp(ir->name, "gl_FragDepth") == 0) {
      this->frag_depth = ir;
   }

   if (ir->mode == ir_var_in) {
      if (!strcmp(ir->name, "gl_FragCoord")) {
	 reg = emit_fragcoord_interpolation(ir);
      } else if (!strcmp(ir->name, "gl_FrontFacing")) {
	 reg = emit_frontfacing_interpolation(ir);
      } else {
	 reg = emit_general_interpolation(ir);
      }
      assert(reg);
      hash_table_insert(this->variable_ht, reg, ir);
      return;
   }

   if (ir->mode == ir_var_uniform) {
      /* Remember where this uniform's params start before the setup
       * calls advance nr_params.
       */
      int param_index = c->prog_data.nr_params;

      /* gl_* builtins go through the state-var tables. */
      if (!strncmp(ir->name, "gl_", 3)) {
	 setup_builtin_uniform_values(ir);
      } else {
	 setup_uniform_values(ir->location, ir->type);
      }

      reg = new(this->mem_ctx) fs_reg(UNIFORM, param_index);
      reg->type = brw_type_for_base_type(ir->type);
   }

   /* Plain temporaries/outputs: allocate a virtual GRF. */
   if (!reg)
      reg = new(this->mem_ctx) fs_reg(this, ir->type);

   hash_table_insert(this->variable_ht, reg, ir);
}
734
/**
 * A variable dereference evaluates to the variable's assigned storage.
 * Storage is expected to have been created by visit(ir_variable *)
 * before any dereference reaches here.
 */
void
fs_visitor::visit(ir_dereference_variable *ir)
{
   fs_reg *reg = variable_storage(ir->var);
   this->result = *reg;
}
741
742void
743fs_visitor::visit(ir_dereference_record *ir)
744{
745   const glsl_type *struct_type = ir->record->type;
746
747   ir->record->accept(this);
748
749   unsigned int offset = 0;
750   for (unsigned int i = 0; i < struct_type->length; i++) {
751      if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
752	 break;
753      offset += type_size(struct_type->fields.structure[i].type);
754   }
755   this->result.reg_offset += offset;
756   this->result.type = brw_type_for_base_type(ir->type);
757}
758
759void
760fs_visitor::visit(ir_dereference_array *ir)
761{
762   ir_constant *index;
763   int element_size;
764
765   ir->array->accept(this);
766   index = ir->array_index->as_constant();
767
768   element_size = type_size(ir->type);
769   this->result.type = brw_type_for_base_type(ir->type);
770
771   if (index) {
772      assert(this->result.file == UNIFORM ||
773	     (this->result.file == GRF &&
774	      this->result.reg != 0));
775      this->result.reg_offset += index->value.i[0] * element_size;
776   } else {
777      assert(!"FINISHME: non-constant array element");
778   }
779}
780
781/* Instruction selection: Produce a MOV.sat instead of
782 * MIN(MAX(val, 0), 1) when possible.
783 */
784bool
785fs_visitor::try_emit_saturate(ir_expression *ir)
786{
787   ir_rvalue *sat_val = ir->as_rvalue_to_saturate();
788
789   if (!sat_val)
790      return false;
791
792   sat_val->accept(this);
793   fs_reg src = this->result;
794
795   this->result = fs_reg(this, ir->type);
796   fs_inst *inst = emit(BRW_OPCODE_MOV, this->result, src);
797   inst->saturate = true;
798
799   return true;
800}
801
802static uint32_t
803brw_conditional_for_comparison(unsigned int op)
804{
805   switch (op) {
806   case ir_binop_less:
807      return BRW_CONDITIONAL_L;
808   case ir_binop_greater:
809      return BRW_CONDITIONAL_G;
810   case ir_binop_lequal:
811      return BRW_CONDITIONAL_LE;
812   case ir_binop_gequal:
813      return BRW_CONDITIONAL_GE;
814   case ir_binop_equal:
815   case ir_binop_all_equal: /* same as equal for scalars */
816      return BRW_CONDITIONAL_Z;
817   case ir_binop_nequal:
818   case ir_binop_any_nequal: /* same as nequal for scalars */
819      return BRW_CONDITIONAL_NZ;
820   default:
821      assert(!"not reached: bad operation for comparison");
822      return BRW_CONDITIONAL_NZ;
823   }
824}
825
/**
 * Generates code for an IR expression.  Operands are evaluated first
 * (matrix and vector operands must already have been scalarized by the
 * lowering passes run at link time), then the operation is dispatched
 * to the matching hardware opcode or math routine, leaving the result
 * register in this->result.
 */
void
fs_visitor::visit(ir_expression *ir)
{
   unsigned int operand;
   fs_reg op[2], temp;
   fs_inst *inst;

   assert(ir->get_num_operands() <= 2);

   /* Clamp-to-[0,1] patterns become a single saturated MOV. */
   if (try_emit_saturate(ir))
      return;

   for (operand = 0; operand < ir->get_num_operands(); operand++) {
      ir->operands[operand]->accept(this);
      if (this->result.file == BAD_FILE) {
	 ir_print_visitor v;
	 fail("Failed to get tree for expression operand:\n");
	 ir->operands[operand]->accept(&v);
      }
      op[operand] = this->result;

      /* Matrix expression operands should have been broken down to vector
       * operations already.
       */
      assert(!ir->operands[operand]->type->is_matrix());
      /* And then those vector operands should have been broken down to scalar.
       */
      assert(!ir->operands[operand]->type->is_vector());
   }

   /* Storage for our result.  If our result goes into an assignment, it will
    * just get copy-propagated out, so no worries.
    */
   this->result = fs_reg(this, ir->type);

   switch (ir->operation) {
   case ir_unop_logic_not:
      /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
       * ones complement of the whole register, not just bit 0.
       */
      emit(BRW_OPCODE_XOR, this->result, op[0], fs_reg(1));
      break;
   case ir_unop_neg:
      /* Negate/abs are free: flip the source modifier instead of
       * emitting an instruction.
       */
      op[0].negate = !op[0].negate;
      this->result = op[0];
      break;
   case ir_unop_abs:
      op[0].abs = true;
      op[0].negate = false;
      this->result = op[0];
      break;
   case ir_unop_sign:
      /* sign(x): start at 0, predicate-in 1.0 where x > 0 and -1.0
       * where x < 0.
       */
      temp = fs_reg(this, ir->type);

      emit(BRW_OPCODE_MOV, this->result, fs_reg(0.0f));

      inst = emit(BRW_OPCODE_CMP, reg_null_f, op[0], fs_reg(0.0f));
      inst->conditional_mod = BRW_CONDITIONAL_G;
      inst = emit(BRW_OPCODE_MOV, this->result, fs_reg(1.0f));
      inst->predicated = true;

      inst = emit(BRW_OPCODE_CMP, reg_null_f, op[0], fs_reg(0.0f));
      inst->conditional_mod = BRW_CONDITIONAL_L;
      inst = emit(BRW_OPCODE_MOV, this->result, fs_reg(-1.0f));
      inst->predicated = true;

      break;
   case ir_unop_rcp:
      emit_math(FS_OPCODE_RCP, this->result, op[0]);
      break;

   case ir_unop_exp2:
      emit_math(FS_OPCODE_EXP2, this->result, op[0]);
      break;
   case ir_unop_log2:
      emit_math(FS_OPCODE_LOG2, this->result, op[0]);
      break;
   case ir_unop_exp:
   case ir_unop_log:
      assert(!"not reached: should be handled by ir_explog_to_explog2");
      break;
   case ir_unop_sin:
   case ir_unop_sin_reduced:
      emit_math(FS_OPCODE_SIN, this->result, op[0]);
      break;
   case ir_unop_cos:
   case ir_unop_cos_reduced:
      emit_math(FS_OPCODE_COS, this->result, op[0]);
      break;

   case ir_unop_dFdx:
      emit(FS_OPCODE_DDX, this->result, op[0]);
      break;
   case ir_unop_dFdy:
      emit(FS_OPCODE_DDY, this->result, op[0]);
      break;

   case ir_binop_add:
      emit(BRW_OPCODE_ADD, this->result, op[0], op[1]);
      break;
   case ir_binop_sub:
      assert(!"not reached: should be handled by ir_sub_to_add_neg");
      break;

   case ir_binop_mul:
      emit(BRW_OPCODE_MUL, this->result, op[0], op[1]);
      break;
   case ir_binop_div:
      assert(!"not reached: should be handled by ir_div_to_mul_rcp");
      break;
   case ir_binop_mod:
      assert(!"ir_binop_mod should have been converted to b * fract(a/b)");
      break;

   case ir_binop_less:
   case ir_binop_greater:
   case ir_binop_lequal:
   case ir_binop_gequal:
   case ir_binop_equal:
   case ir_binop_all_equal:
   case ir_binop_nequal:
   case ir_binop_any_nequal:
      temp = this->result;
      /* original gen4 does implicit conversion before comparison. */
      if (intel->gen < 5)
	 temp.type = op[0].type;

      /* CMP then mask to bit 0 to produce a clean 0/1 boolean. */
      inst = emit(BRW_OPCODE_CMP, temp, op[0], op[1]);
      inst->conditional_mod = brw_conditional_for_comparison(ir->operation);
      emit(BRW_OPCODE_AND, this->result, this->result, fs_reg(0x1));
      break;

   case ir_binop_logic_xor:
      emit(BRW_OPCODE_XOR, this->result, op[0], op[1]);
      break;

   case ir_binop_logic_or:
      emit(BRW_OPCODE_OR, this->result, op[0], op[1]);
      break;

   case ir_binop_logic_and:
      emit(BRW_OPCODE_AND, this->result, op[0], op[1]);
      break;

   case ir_binop_dot:
   case ir_unop_any:
      assert(!"not reached: should be handled by brw_fs_channel_expressions");
      break;

   case ir_unop_noise:
      assert(!"not reached: should be handled by lower_noise");
      break;

   case ir_quadop_vector:
      assert(!"not reached: should be handled by lower_quadop_vector");
      break;

   case ir_unop_sqrt:
      emit_math(FS_OPCODE_SQRT, this->result, op[0]);
      break;

   case ir_unop_rsq:
      emit_math(FS_OPCODE_RSQ, this->result, op[0]);
      break;

   case ir_unop_i2f:
   case ir_unop_b2f:
   case ir_unop_b2i:
   case ir_unop_f2i:
      /* A MOV between differently-typed registers performs the
       * conversion.
       */
      emit(BRW_OPCODE_MOV, this->result, op[0]);
      break;
   case ir_unop_f2b:
   case ir_unop_i2b:
      temp = this->result;
      /* original gen4 does implicit conversion before comparison. */
      if (intel->gen < 5)
	 temp.type = op[0].type;

      inst = emit(BRW_OPCODE_CMP, temp, op[0], fs_reg(0.0f));
      inst->conditional_mod = BRW_CONDITIONAL_NZ;
      inst = emit(BRW_OPCODE_AND, this->result, this->result, fs_reg(1));
      break;

   case ir_unop_trunc:
      emit(BRW_OPCODE_RNDZ, this->result, op[0]);
      break;
   case ir_unop_ceil:
      /* ceil(x) = -floor(-x), using RNDD (round down) with both source
       * and result negated.
       */
      op[0].negate = !op[0].negate;
      inst = emit(BRW_OPCODE_RNDD, this->result, op[0]);
      this->result.negate = true;
      break;
   case ir_unop_floor:
      inst = emit(BRW_OPCODE_RNDD, this->result, op[0]);
      break;
   case ir_unop_fract:
      inst = emit(BRW_OPCODE_FRC, this->result, op[0]);
      break;
   case ir_unop_round_even:
      emit(BRW_OPCODE_RNDE, this->result, op[0]);
      break;

   case ir_binop_min:
      /* CMP sets the flag, then a predicated SEL picks the smaller. */
      inst = emit(BRW_OPCODE_CMP, this->result, op[0], op[1]);
      inst->conditional_mod = BRW_CONDITIONAL_L;

      inst = emit(BRW_OPCODE_SEL, this->result, op[0], op[1]);
      inst->predicated = true;
      break;
   case ir_binop_max:
      inst = emit(BRW_OPCODE_CMP, this->result, op[0], op[1]);
      inst->conditional_mod = BRW_CONDITIONAL_G;

      inst = emit(BRW_OPCODE_SEL, this->result, op[0], op[1]);
      inst->predicated = true;
      break;

   case ir_binop_pow:
      emit_math(FS_OPCODE_POW, this->result, op[0], op[1]);
      break;

   case ir_unop_bit_not:
      inst = emit(BRW_OPCODE_NOT, this->result, op[0]);
      break;
   case ir_binop_bit_and:
      inst = emit(BRW_OPCODE_AND, this->result, op[0], op[1]);
      break;
   case ir_binop_bit_xor:
      inst = emit(BRW_OPCODE_XOR, this->result, op[0], op[1]);
      break;
   case ir_binop_bit_or:
      inst = emit(BRW_OPCODE_OR, this->result, op[0], op[1]);
      break;

   case ir_unop_u2f:
   case ir_binop_lshift:
   case ir_binop_rshift:
      assert(!"GLSL 1.30 features unsupported");
      break;
   }
}
1066
/**
 * Recursively emits per-component MOVs copying \p r into \p l.
 *
 * Both \p l and \p r are passed by reference and have their reg_offset
 * advanced as components are written, so on return they point one past
 * the data consumed/produced.  When \p predicated is set, each MOV is
 * emitted with predication enabled (used for conditional assignments,
 * where the flag register was set up by the caller).
 */
void
fs_visitor::emit_assignment_writes(fs_reg &l, fs_reg &r,
				   const glsl_type *type, bool predicated)
{
   switch (type->base_type) {
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_BOOL:
      /* Scalar/vector/matrix base case: one MOV per scalar component. */
      for (unsigned int i = 0; i < type->components(); i++) {
	 l.type = brw_type_for_base_type(type);
	 r.type = brw_type_for_base_type(type);

	 fs_inst *inst = emit(BRW_OPCODE_MOV, l, r);
	 inst->predicated = predicated;

	 l.reg_offset++;
	 r.reg_offset++;
      }
      break;
   case GLSL_TYPE_ARRAY:
      /* Recurse once per element; l/r advance through the recursion. */
      for (unsigned int i = 0; i < type->length; i++) {
	 emit_assignment_writes(l, r, type->fields.array, predicated);
      }
      break;

   case GLSL_TYPE_STRUCT:
      /* Recurse once per field, each with its own type. */
      for (unsigned int i = 0; i < type->length; i++) {
	 emit_assignment_writes(l, r, type->fields.structure[i].type,
				predicated);
      }
      break;

   case GLSL_TYPE_SAMPLER:
      /* Samplers occupy no register storage; nothing to copy. */
      break;

   default:
      assert(!"not reached");
      break;
   }
}
1108
/**
 * Emits the MOVs implementing an IR assignment, honoring the write mask
 * and optional assignment condition.
 */
void
fs_visitor::visit(ir_assignment *ir)
{
   struct fs_reg l, r;
   fs_inst *inst;

   /* FINISHME: arrays on the lhs */
   ir->lhs->accept(this);
   l = this->result;

   ir->rhs->accept(this);
   r = this->result;

   assert(l.file != BAD_FILE);
   assert(r.file != BAD_FILE);

   /* Set the flag register so the component MOVs can be predicated. */
   if (ir->condition) {
      emit_bool_to_cond_code(ir->condition);
   }

   if (ir->lhs->type->is_scalar() ||
       ir->lhs->type->is_vector()) {
      /* Scalar/vector LHS: copy only write-mask-enabled channels.  Note
       * that r advances only for channels actually written (the RHS has
       * as many components as enabled mask bits), while l advances for
       * every channel.
       */
      for (int i = 0; i < ir->lhs->type->vector_elements; i++) {
	 if (ir->write_mask & (1 << i)) {
	    inst = emit(BRW_OPCODE_MOV, l, r);
	    if (ir->condition)
	       inst->predicated = true;
	    r.reg_offset++;
	 }
	 l.reg_offset++;
      }
   } else {
      /* Aggregate LHS (array/struct/matrix): write masks don't apply,
       * so copy every component recursively.
       */
      emit_assignment_writes(l, r, ir->lhs->type, ir->condition != NULL);
   }
}
1144
/**
 * Builds the sampler message payload in MRFs and emits the texture
 * instruction for gen4 (original i965).
 *
 * The payload starts at base_mrf (m1) with the g0 header, followed by
 * u/v/r coordinates and any optional bias/lod/shadow-comparison values.
 * Returns the emitted sampler instruction so the caller can fill in the
 * sampler index, header source, etc.
 */
fs_inst *
fs_visitor::emit_texture_gen4(ir_texture *ir, fs_reg dst, fs_reg coordinate)
{
   int mlen;
   int base_mrf = 1;
   bool simd16 = false;
   fs_reg orig_dst;

   /* g0 header. */
   mlen = 1;

   if (ir->shadow_comparitor) {
      for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
	 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i), coordinate);
	 coordinate.reg_offset++;
      }
      /* gen4's SIMD8 sampler always has the slots for u,v,r present. */
      mlen += 3;

      if (ir->op == ir_tex) {
	 /* There's no plain shadow compare message, so we use shadow
	  * compare with a bias of 0.0.
	  */
	 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), fs_reg(0.0f));
	 mlen++;
      } else if (ir->op == ir_txb) {
	 ir->lod_info.bias->accept(this);
	 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
	 mlen++;
      } else {
	 assert(ir->op == ir_txl);
	 ir->lod_info.lod->accept(this);
	 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
	 mlen++;
      }

      /* The comparison reference value goes after bias/lod. */
      ir->shadow_comparitor->accept(this);
      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
      mlen++;
   } else if (ir->op == ir_tex) {
      for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
	 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i), coordinate);
	 coordinate.reg_offset++;
      }
      /* gen4's SIMD8 sampler always has the slots for u,v,r present. */
      mlen += 3;
   } else if (ir->op == ir_txd) {
      assert(!"TXD isn't supported on gen4 yet.");
   } else {
      /* Oh joy.  gen4 doesn't have SIMD8 non-shadow-compare bias/lod
       * instructions.  We'll need to do SIMD16 here.
       */
      assert(ir->op == ir_txb || ir->op == ir_txl);

      /* In SIMD16 each parameter occupies two registers, so coordinate
       * components land at every other MRF.
       */
      for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
	 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i * 2), coordinate);
	 coordinate.reg_offset++;
      }

      /* lod/bias appears after u/v/r. */
      mlen += 6;

      if (ir->op == ir_txb) {
	 ir->lod_info.bias->accept(this);
	 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
	 mlen++;
      } else {
	 ir->lod_info.lod->accept(this);
	 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
	 mlen++;
      }

      /* The unused upper half. */
      mlen++;

      /* Now, since we're doing simd16, the return is 2 interleaved
       * vec4s where the odd-indexed ones are junk. We'll need to move
       * this weirdness around to the expected layout.
       */
      simd16 = true;
      orig_dst = dst;
      dst = fs_reg(this, glsl_type::get_array_instance(glsl_type::vec4_type,
						       2));
      dst.type = BRW_REGISTER_TYPE_F;
   }

   fs_inst *inst = NULL;
   switch (ir->op) {
   case ir_tex:
      inst = emit(FS_OPCODE_TEX, dst);
      break;
   case ir_txb:
      inst = emit(FS_OPCODE_TXB, dst);
      break;
   case ir_txl:
      inst = emit(FS_OPCODE_TXL, dst);
      break;
   case ir_txd:
      inst = emit(FS_OPCODE_TXD, dst);
      break;
   case ir_txf:
      assert(!"GLSL 1.30 features unsupported");
      break;
   }
   inst->base_mrf = base_mrf;
   inst->mlen = mlen;

   if (simd16) {
      /* Deinterleave the SIMD16 result: keep the even-indexed vec4s
       * (stride 2 through dst) and pack them into orig_dst.
       */
      for (int i = 0; i < 4; i++) {
	 emit(BRW_OPCODE_MOV, orig_dst, dst);
	 orig_dst.reg_offset++;
	 dst.reg_offset += 2;
      }
   }

   return inst;
}
1262
/**
 * Builds the sampler message payload and emits the texture instruction
 * for gen5 (Ironlake) and later.
 *
 * Returns the emitted sampler instruction for the caller to finish
 * setting up (sampler index, header source, etc.).
 */
fs_inst *
fs_visitor::emit_texture_gen5(ir_texture *ir, fs_reg dst, fs_reg coordinate)
{
   /* gen5's SIMD8 sampler has slots for u, v, r, array index, then
    * optional parameters like shadow comparitor or LOD bias.  If
    * optional parameters aren't present, those base slots are
    * optional and don't need to be included in the message.
    *
    * We don't fill in the unnecessary slots regardless, which may
    * look surprising in the disassembly.
    */
   int mlen = 1; /* g0 header always present. */
   int base_mrf = 1;

   for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i), coordinate);
      coordinate.reg_offset++;
   }
   mlen += ir->coordinate->type->vector_elements;

   if (ir->shadow_comparitor) {
      /* The reference value lives in the slot after u/v/r/ai, i.e. at
       * message offset 5 or later even for fewer coordinate components.
       */
      mlen = MAX2(mlen, 5);

      ir->shadow_comparitor->accept(this);
      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
      mlen++;
   }

   fs_inst *inst = NULL;
   switch (ir->op) {
   case ir_tex:
      inst = emit(FS_OPCODE_TEX, dst);
      break;
   case ir_txb:
      /* Bias goes after the (possibly skipped) base parameter slots. */
      ir->lod_info.bias->accept(this);
      mlen = MAX2(mlen, 5);
      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
      mlen++;

      inst = emit(FS_OPCODE_TXB, dst);
      break;
   case ir_txl:
      ir->lod_info.lod->accept(this);
      mlen = MAX2(mlen, 5);
      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
      mlen++;

      inst = emit(FS_OPCODE_TXL, dst);
      break;
   case ir_txd:
   case ir_txf:
      assert(!"GLSL 1.30 features unsupported");
      break;
   }
   inst->base_mrf = base_mrf;
   inst->mlen = mlen;

   return inst;
}
1322
/**
 * Emits code for a texture sampling operation: computes the coordinate
 * (including GL_TEXTURE_RECTANGLE normalization and texel offsets),
 * dispatches to the per-generation payload builder, and applies any
 * DEPTH_TEXTURE_MODE swizzling to the result.
 */
void
fs_visitor::visit(ir_texture *ir)
{
   int sampler;
   fs_inst *inst = NULL;

   ir->coordinate->accept(this);
   fs_reg coordinate = this->result;

   if (ir->offset != NULL) {
      /* Texel offsets must be constant expressions in GLSL. */
      ir_constant *offset = ir->offset->as_constant();
      assert(offset != NULL);

      signed char offsets[3];
      for (unsigned i = 0; i < ir->offset->type->vector_elements; i++)
	 offsets[i] = (signed char) offset->value.i[i];

      /* Combine all three offsets into a single unsigned dword:
       *
       *    bits 11:8 - U Offset (X component)
       *    bits  7:4 - V Offset (Y component)
       *    bits  3:0 - R Offset (Z component)
       */
      unsigned offset_bits = 0;
      for (unsigned i = 0; i < ir->offset->type->vector_elements; i++) {
	 const unsigned shift = 4 * (2 - i);
	 offset_bits |= (offsets[i] << shift) & (0xF << shift);
      }

      /* Explicitly set up the message header by copying g0 to msg reg m1. */
      emit(BRW_OPCODE_MOV, fs_reg(MRF, 1, BRW_REGISTER_TYPE_UD),
	   fs_reg(GRF, 0, BRW_REGISTER_TYPE_UD));

      /* Then set the offset bits in DWord 2 of the message header. */
      emit(BRW_OPCODE_MOV,
	   fs_reg(retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, 1, 2),
			 BRW_REGISTER_TYPE_UD)),
	   fs_reg(brw_imm_uw(offset_bits)));
   }

   /* Should be lowered by do_lower_texture_projection */
   assert(!ir->projector);

   /* Map the GLSL sampler uniform to the hardware sampler unit. */
   sampler = _mesa_get_sampler_uniform_value(ir->sampler,
					     ctx->Shader.CurrentFragmentProgram,
					     &brw->fragment_program->Base);
   sampler = c->fp->program.Base.SamplerUnits[sampler];

   /* The 965 requires the EU to do the normalization of GL rectangle
    * texture coordinates.  We use the program parameter state
    * tracking to get the scaling factor.
    */
   if (ir->sampler->type->sampler_dimensionality == GLSL_SAMPLER_DIM_RECT) {
      struct gl_program_parameter_list *params = c->fp->program.Base.Parameters;
      int tokens[STATE_LENGTH] = {
	 STATE_INTERNAL,
	 STATE_TEXRECT_SCALE,
	 sampler,
	 0,
	 0
      };

      /* The scale factors are already floats; no conversion needed when
       * uploading them as push constants.
       */
      c->prog_data.param_convert[c->prog_data.nr_params] =
	 PARAM_NO_CONVERT;
      c->prog_data.param_convert[c->prog_data.nr_params + 1] =
	 PARAM_NO_CONVERT;

      fs_reg scale_x = fs_reg(UNIFORM, c->prog_data.nr_params);
      fs_reg scale_y = fs_reg(UNIFORM, c->prog_data.nr_params + 1);
      GLuint index = _mesa_add_state_reference(params,
					       (gl_state_index *)tokens);

      /* Reserve two parameter slots: offsets 0 and 1 of the texrect
       * scale state vector (1/width, 1/height).
       */
      this->param_index[c->prog_data.nr_params] = index;
      this->param_offset[c->prog_data.nr_params] = 0;
      c->prog_data.nr_params++;
      this->param_index[c->prog_data.nr_params] = index;
      this->param_offset[c->prog_data.nr_params] = 1;
      c->prog_data.nr_params++;

      /* Scale the incoming coordinate into a fresh register. */
      fs_reg dst = fs_reg(this, ir->coordinate->type);
      fs_reg src = coordinate;
      coordinate = dst;

      emit(BRW_OPCODE_MUL, dst, src, scale_x);
      dst.reg_offset++;
      src.reg_offset++;
      emit(BRW_OPCODE_MUL, dst, src, scale_y);
   }

   /* Writemasking doesn't eliminate channels on SIMD8 texture
    * samples, so don't worry about them.
    */
   fs_reg dst = fs_reg(this, glsl_type::vec4_type);

   if (intel->gen < 5) {
      inst = emit_texture_gen4(ir, dst, coordinate);
   } else {
      inst = emit_texture_gen5(ir, dst, coordinate);
   }

   /* If there's an offset, we already set up m1.  To avoid the implied move,
    * use the null register.  Otherwise, we want an implied move from g0.
    */
   if (ir->offset != NULL)
      inst->src[0] = fs_reg(brw_null_reg());
   else
      inst->src[0] = fs_reg(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW));

   inst->sampler = sampler;

   this->result = dst;

   if (ir->shadow_comparitor)
      inst->shadow_compare = true;

   if (ir->type == glsl_type::float_type) {
      /* Ignore DEPTH_TEXTURE_MODE swizzling. */
      assert(ir->sampler->type->sampler_shadow);
   } else if (c->key.tex_swizzles[inst->sampler] != SWIZZLE_NOOP) {
      /* Apply the depth-texture-mode swizzle from the program key by
       * shuffling (or filling with 0/1) the result channels.
       */
      fs_reg swizzle_dst = fs_reg(this, glsl_type::vec4_type);

      for (int i = 0; i < 4; i++) {
	 int swiz = GET_SWZ(c->key.tex_swizzles[inst->sampler], i);
	 fs_reg l = swizzle_dst;
	 l.reg_offset += i;

	 if (swiz == SWIZZLE_ZERO) {
	    emit(BRW_OPCODE_MOV, l, fs_reg(0.0f));
	 } else if (swiz == SWIZZLE_ONE) {
	    emit(BRW_OPCODE_MOV, l, fs_reg(1.0f));
	 } else {
	    fs_reg r = dst;
	    r.reg_offset += GET_SWZ(c->key.tex_swizzles[inst->sampler], i);
	    emit(BRW_OPCODE_MOV, l, r);
	 }
      }
      this->result = swizzle_dst;
   }
}
1462
1463void
1464fs_visitor::visit(ir_swizzle *ir)
1465{
1466   ir->val->accept(this);
1467   fs_reg val = this->result;
1468
1469   if (ir->type->vector_elements == 1) {
1470      this->result.reg_offset += ir->mask.x;
1471      return;
1472   }
1473
1474   fs_reg result = fs_reg(this, ir->type);
1475   this->result = result;
1476
1477   for (unsigned int i = 0; i < ir->type->vector_elements; i++) {
1478      fs_reg channel = val;
1479      int swiz = 0;
1480
1481      switch (i) {
1482      case 0:
1483	 swiz = ir->mask.x;
1484	 break;
1485      case 1:
1486	 swiz = ir->mask.y;
1487	 break;
1488      case 2:
1489	 swiz = ir->mask.z;
1490	 break;
1491      case 3:
1492	 swiz = ir->mask.w;
1493	 break;
1494      }
1495
1496      channel.reg_offset += swiz;
1497      emit(BRW_OPCODE_MOV, result, channel);
1498      result.reg_offset++;
1499   }
1500}
1501
1502void
1503fs_visitor::visit(ir_discard *ir)
1504{
1505   fs_reg temp = fs_reg(this, glsl_type::uint_type);
1506
1507   assert(ir->condition == NULL); /* FINISHME */
1508
1509   emit(FS_OPCODE_DISCARD_NOT, temp, reg_null_d);
1510   emit(FS_OPCODE_DISCARD_AND, reg_null_d, temp);
1511   kill_emitted = true;
1512}
1513
1514void
1515fs_visitor::visit(ir_constant *ir)
1516{
1517   /* Set this->result to reg at the bottom of the function because some code
1518    * paths will cause this visitor to be applied to other fields.  This will
1519    * cause the value stored in this->result to be modified.
1520    *
1521    * Make reg constant so that it doesn't get accidentally modified along the
1522    * way.  Yes, I actually had this problem. :(
1523    */
1524   const fs_reg reg(this, ir->type);
1525   fs_reg dst_reg = reg;
1526
1527   if (ir->type->is_array()) {
1528      const unsigned size = type_size(ir->type->fields.array);
1529
1530      for (unsigned i = 0; i < ir->type->length; i++) {
1531	 ir->array_elements[i]->accept(this);
1532	 fs_reg src_reg = this->result;
1533
1534	 dst_reg.type = src_reg.type;
1535	 for (unsigned j = 0; j < size; j++) {
1536	    emit(BRW_OPCODE_MOV, dst_reg, src_reg);
1537	    src_reg.reg_offset++;
1538	    dst_reg.reg_offset++;
1539	 }
1540      }
1541   } else if (ir->type->is_record()) {
1542      foreach_list(node, &ir->components) {
1543	 ir_instruction *const field = (ir_instruction *) node;
1544	 const unsigned size = type_size(field->type);
1545
1546	 field->accept(this);
1547	 fs_reg src_reg = this->result;
1548
1549	 dst_reg.type = src_reg.type;
1550	 for (unsigned j = 0; j < size; j++) {
1551	    emit(BRW_OPCODE_MOV, dst_reg, src_reg);
1552	    src_reg.reg_offset++;
1553	    dst_reg.reg_offset++;
1554	 }
1555      }
1556   } else {
1557      const unsigned size = type_size(ir->type);
1558
1559      for (unsigned i = 0; i < size; i++) {
1560	 switch (ir->type->base_type) {
1561	 case GLSL_TYPE_FLOAT:
1562	    emit(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.f[i]));
1563	    break;
1564	 case GLSL_TYPE_UINT:
1565	    emit(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.u[i]));
1566	    break;
1567	 case GLSL_TYPE_INT:
1568	    emit(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.i[i]));
1569	    break;
1570	 case GLSL_TYPE_BOOL:
1571	    emit(BRW_OPCODE_MOV, dst_reg, fs_reg((int)ir->value.b[i]));
1572	    break;
1573	 default:
1574	    assert(!"Non-float/uint/int/bool constant");
1575	 }
1576	 dst_reg.reg_offset++;
1577      }
1578   }
1579
1580   this->result = reg;
1581}
1582
/**
 * Emits an instruction whose conditional_mod sets the flag register
 * based on the boolean rvalue \p ir, so a following instruction can be
 * predicated on it.
 *
 * Recognized comparison/logic expressions are folded directly into a
 * single flag-setting instruction; any other expression is evaluated to
 * a register and then tested against zero.
 */
void
fs_visitor::emit_bool_to_cond_code(ir_rvalue *ir)
{
   ir_expression *expr = ir->as_expression();

   if (expr) {
      fs_reg op[2];
      fs_inst *inst;

      assert(expr->get_num_operands() <= 2);
      for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
	 assert(expr->operands[i]->type->is_scalar());

	 expr->operands[i]->accept(this);
	 op[i] = this->result;
      }

      switch (expr->operation) {
      case ir_unop_logic_not:
	 /* Booleans are stored as 0/1, so !x means the low bit is zero. */
	 inst = emit(BRW_OPCODE_AND, reg_null_d, op[0], fs_reg(1));
	 inst->conditional_mod = BRW_CONDITIONAL_Z;
	 break;

      case ir_binop_logic_xor:
	 inst = emit(BRW_OPCODE_XOR, reg_null_d, op[0], op[1]);
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 break;

      case ir_binop_logic_or:
	 inst = emit(BRW_OPCODE_OR, reg_null_d, op[0], op[1]);
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 break;

      case ir_binop_logic_and:
	 inst = emit(BRW_OPCODE_AND, reg_null_d, op[0], op[1]);
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 break;

      case ir_unop_f2b:
	 /* On gen6+ a CMP against 0.0 is used; earlier gens set the flag
	  * from a MOV through the null float register.
	  */
	 if (intel->gen >= 6) {
	    inst = emit(BRW_OPCODE_CMP, reg_null_d, op[0], fs_reg(0.0f));
	 } else {
	    inst = emit(BRW_OPCODE_MOV, reg_null_f, op[0]);
	 }
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 break;

      case ir_unop_i2b:
	 if (intel->gen >= 6) {
	    inst = emit(BRW_OPCODE_CMP, reg_null_d, op[0], fs_reg(0));
	 } else {
	    inst = emit(BRW_OPCODE_MOV, reg_null_d, op[0]);
	 }
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 break;

      case ir_binop_greater:
      case ir_binop_gequal:
      case ir_binop_less:
      case ir_binop_lequal:
      case ir_binop_equal:
      case ir_binop_all_equal:
      case ir_binop_nequal:
      case ir_binop_any_nequal:
	 /* Comparisons map directly to a CMP with the matching
	  * conditional mod.
	  */
	 inst = emit(BRW_OPCODE_CMP, reg_null_cmp, op[0], op[1]);
	 inst->conditional_mod =
	    brw_conditional_for_comparison(expr->operation);
	 break;

      default:
	 assert(!"not reached");
	 fail("bad cond code\n");
	 break;
      }
      return;
   }

   /* Not a recognized expression: evaluate it and test the result. */
   ir->accept(this);

   if (intel->gen >= 6) {
      fs_inst *inst = emit(BRW_OPCODE_AND, reg_null_d, this->result, fs_reg(1));
      inst->conditional_mod = BRW_CONDITIONAL_NZ;
   } else {
      fs_inst *inst = emit(BRW_OPCODE_MOV, reg_null_d, this->result);
      inst->conditional_mod = BRW_CONDITIONAL_NZ;
   }
}
1670
1671/**
1672 * Emit a gen6 IF statement with the comparison folded into the IF
1673 * instruction.
1674 */
1675void
1676fs_visitor::emit_if_gen6(ir_if *ir)
1677{
1678   ir_expression *expr = ir->condition->as_expression();
1679
1680   if (expr) {
1681      fs_reg op[2];
1682      fs_inst *inst;
1683      fs_reg temp;
1684
1685      assert(expr->get_num_operands() <= 2);
1686      for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
1687	 assert(expr->operands[i]->type->is_scalar());
1688
1689	 expr->operands[i]->accept(this);
1690	 op[i] = this->result;
1691      }
1692
1693      switch (expr->operation) {
1694      case ir_unop_logic_not:
1695	 inst = emit(BRW_OPCODE_IF, temp, op[0], fs_reg(0));
1696	 inst->conditional_mod = BRW_CONDITIONAL_Z;
1697	 return;
1698
1699      case ir_binop_logic_xor:
1700	 inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], op[1]);
1701	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1702	 return;
1703
1704      case ir_binop_logic_or:
1705	 temp = fs_reg(this, glsl_type::bool_type);
1706	 emit(BRW_OPCODE_OR, temp, op[0], op[1]);
1707	 inst = emit(BRW_OPCODE_IF, reg_null_d, temp, fs_reg(0));
1708	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1709	 return;
1710
1711      case ir_binop_logic_and:
1712	 temp = fs_reg(this, glsl_type::bool_type);
1713	 emit(BRW_OPCODE_AND, temp, op[0], op[1]);
1714	 inst = emit(BRW_OPCODE_IF, reg_null_d, temp, fs_reg(0));
1715	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1716	 return;
1717
1718      case ir_unop_f2b:
1719	 inst = emit(BRW_OPCODE_IF, reg_null_f, op[0], fs_reg(0));
1720	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1721	 return;
1722
1723      case ir_unop_i2b:
1724	 inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], fs_reg(0));
1725	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1726	 return;
1727
1728      case ir_binop_greater:
1729      case ir_binop_gequal:
1730      case ir_binop_less:
1731      case ir_binop_lequal:
1732      case ir_binop_equal:
1733      case ir_binop_all_equal:
1734      case ir_binop_nequal:
1735      case ir_binop_any_nequal:
1736	 inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], op[1]);
1737	 inst->conditional_mod =
1738	    brw_conditional_for_comparison(expr->operation);
1739	 return;
1740      default:
1741	 assert(!"not reached");
1742	 inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], fs_reg(0));
1743	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1744	 fail("bad condition\n");
1745	 return;
1746      }
1747      return;
1748   }
1749
1750   ir->condition->accept(this);
1751
1752   fs_inst *inst = emit(BRW_OPCODE_IF, reg_null_d, this->result, fs_reg(0));
1753   inst->conditional_mod = BRW_CONDITIONAL_NZ;
1754}
1755
/**
 * Emits the IF/ELSE/ENDIF structure for an IR if statement, visiting
 * the then- and else-blocks in between.
 */
void
fs_visitor::visit(ir_if *ir)
{
   fs_inst *inst;

   /* Don't point the annotation at the if statement, because then it plus
    * the then and else blocks get printed.
    */
   this->base_ir = ir->condition;

   if (intel->gen >= 6) {
      /* gen6 can fold the comparison into the IF instruction itself. */
      emit_if_gen6(ir);
   } else {
      /* Earlier gens set the flag register, then predicate the IF. */
      emit_bool_to_cond_code(ir->condition);

      inst = emit(BRW_OPCODE_IF);
      inst->predicated = true;
   }

   foreach_iter(exec_list_iterator, iter, ir->then_instructions) {
      ir_instruction *ir = (ir_instruction *)iter.get();
      this->base_ir = ir;

      ir->accept(this);
   }

   if (!ir->else_instructions.is_empty()) {
      emit(BRW_OPCODE_ELSE);

      foreach_iter(exec_list_iterator, iter, ir->else_instructions) {
	 ir_instruction *ir = (ir_instruction *)iter.get();
	 this->base_ir = ir;

	 ir->accept(this);
      }
   }

   emit(BRW_OPCODE_ENDIF);
}
1795
/**
 * Emits the DO/WHILE structure for an IR loop, including the optional
 * analyzed counter initialization, exit test, and increment.
 */
void
fs_visitor::visit(ir_loop *ir)
{
   fs_reg counter = reg_undef;

   if (ir->counter) {
      /* The loop analysis pass identified an induction variable; set it
       * up and initialize it from ir->from if present.
       */
      this->base_ir = ir->counter;
      ir->counter->accept(this);
      counter = *(variable_storage(ir->counter));

      if (ir->from) {
	 this->base_ir = ir->from;
	 ir->from->accept(this);

	 emit(BRW_OPCODE_MOV, counter, this->result);
      }
   }

   emit(BRW_OPCODE_DO);

   if (ir->to) {
      /* Emit the exit test at the top of the loop: compare the counter
       * against the bound and BREAK when the condition holds.
       */
      this->base_ir = ir->to;
      ir->to->accept(this);

      fs_inst *inst = emit(BRW_OPCODE_CMP, reg_null_cmp, counter, this->result);
      inst->conditional_mod = brw_conditional_for_comparison(ir->cmp);

      inst = emit(BRW_OPCODE_BREAK);
      inst->predicated = true;
   }

   foreach_iter(exec_list_iterator, iter, ir->body_instructions) {
      ir_instruction *ir = (ir_instruction *)iter.get();

      this->base_ir = ir;
      ir->accept(this);
   }

   if (ir->increment) {
      /* Step the counter at the bottom of the loop body. */
      this->base_ir = ir->increment;
      ir->increment->accept(this);
      emit(BRW_OPCODE_ADD, counter, counter, this->result);
   }

   emit(BRW_OPCODE_WHILE);
}
1842
1843void
1844fs_visitor::visit(ir_loop_jump *ir)
1845{
1846   switch (ir->mode) {
1847   case ir_loop_jump::jump_break:
1848      emit(BRW_OPCODE_BREAK);
1849      break;
1850   case ir_loop_jump::jump_continue:
1851      emit(BRW_OPCODE_CONTINUE);
1852      break;
1853   }
1854}
1855
1856void
1857fs_visitor::visit(ir_call *ir)
1858{
1859   assert(!"FINISHME");
1860}
1861
1862void
1863fs_visitor::visit(ir_return *ir)
1864{
1865   assert(!"FINISHME");
1866}
1867
1868void
1869fs_visitor::visit(ir_function *ir)
1870{
1871   /* Ignore function bodies other than main() -- we shouldn't see calls to
1872    * them since they should all be inlined before we get to ir_to_mesa.
1873    */
1874   if (strcmp(ir->name, "main") == 0) {
1875      const ir_function_signature *sig;
1876      exec_list empty;
1877
1878      sig = ir->matching_signature(&empty);
1879
1880      assert(sig);
1881
1882      foreach_iter(exec_list_iterator, iter, sig->body) {
1883	 ir_instruction *ir = (ir_instruction *)iter.get();
1884	 this->base_ir = ir;
1885
1886	 ir->accept(this);
1887      }
1888   }
1889}
1890
1891void
1892fs_visitor::visit(ir_function_signature *ir)
1893{
1894   assert(!"not reached");
1895   (void)ir;
1896}
1897
1898fs_inst *
1899fs_visitor::emit(fs_inst inst)
1900{
1901   fs_inst *list_inst = new(mem_ctx) fs_inst;
1902   *list_inst = inst;
1903
1904   list_inst->annotation = this->current_annotation;
1905   list_inst->ir = this->base_ir;
1906
1907   this->instructions.push_tail(list_inst);
1908
1909   return list_inst;
1910}
1911
1912/** Emits a dummy fragment shader consisting of magenta for bringup purposes. */
1913void
1914fs_visitor::emit_dummy_fs()
1915{
1916   /* Everyone's favorite color. */
1917   emit(BRW_OPCODE_MOV, fs_reg(MRF, 2), fs_reg(1.0f));
1918   emit(BRW_OPCODE_MOV, fs_reg(MRF, 3), fs_reg(0.0f));
1919   emit(BRW_OPCODE_MOV, fs_reg(MRF, 4), fs_reg(1.0f));
1920   emit(BRW_OPCODE_MOV, fs_reg(MRF, 5), fs_reg(0.0f));
1921
1922   fs_inst *write;
1923   write = emit(FS_OPCODE_FB_WRITE, fs_reg(0), fs_reg(0));
1924   write->base_mrf = 0;
1925}
1926
1927/* The register location here is relative to the start of the URB
1928 * data.  It will get adjusted to be a real location before
1929 * generate_code() time.
1930 */
1931struct brw_reg
1932fs_visitor::interp_reg(int location, int channel)
1933{
1934   int regnr = urb_setup[location] * 2 + channel / 2;
1935   int stride = (channel & 1) * 4;
1936
1937   assert(urb_setup[location] != -1);
1938
1939   return brw_vec1_grf(regnr, stride);
1940}
1941
/** Emits the interpolation for the varying inputs. */
void
fs_visitor::emit_interpolation_setup_gen4()
{
   struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);

   /* Add per-channel offsets (encoded in the vector immediates) to the
    * subspan origin X/Y coordinates in g1 to get per-pixel centers.
    */
   this->current_annotation = "compute pixel centers";
   this->pixel_x = fs_reg(this, glsl_type::uint_type);
   this->pixel_y = fs_reg(this, glsl_type::uint_type);
   this->pixel_x.type = BRW_REGISTER_TYPE_UW;
   this->pixel_y.type = BRW_REGISTER_TYPE_UW;
   emit(BRW_OPCODE_ADD,
	this->pixel_x,
	fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
	fs_reg(brw_imm_v(0x10101010)));
   emit(BRW_OPCODE_ADD,
	this->pixel_y,
	fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
	fs_reg(brw_imm_v(0x11001100)));

   /* The deltas feed the LINTERP/PLN plane equations below.  With PLN
    * available, delta_x/delta_y must be adjacent registers.
    */
   this->current_annotation = "compute pixel deltas from v0";
   if (brw->has_pln) {
      this->delta_x = fs_reg(this, glsl_type::vec2_type);
      this->delta_y = this->delta_x;
      this->delta_y.reg_offset++;
   } else {
      this->delta_x = fs_reg(this, glsl_type::float_type);
      this->delta_y = fs_reg(this, glsl_type::float_type);
   }
   emit(BRW_OPCODE_ADD, this->delta_x,
	this->pixel_x, fs_reg(negate(brw_vec1_grf(1, 0))));
   emit(BRW_OPCODE_ADD, this->delta_y,
	this->pixel_y, fs_reg(negate(brw_vec1_grf(1, 1))));

   this->current_annotation = "compute pos.w and 1/pos.w";
   /* Compute wpos.w.  It's always in our setup, since it's needed to
    * interpolate the other attributes.
    */
   this->wpos_w = fs_reg(this, glsl_type::float_type);
   emit(FS_OPCODE_LINTERP, wpos_w, this->delta_x, this->delta_y,
	interp_reg(FRAG_ATTRIB_WPOS, 3));
   /* Compute the pixel 1/W value from wpos.w. */
   this->pixel_w = fs_reg(this, glsl_type::float_type);
   emit_math(FS_OPCODE_RCP, this->pixel_w, wpos_w);
   this->current_annotation = NULL;
}
1988
/** Emits the interpolation for the varying inputs. */
void
fs_visitor::emit_interpolation_setup_gen6()
{
   struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);

   /* If the pixel centers end up used, the setup is the same as for gen4. */
   this->current_annotation = "compute pixel centers";
   fs_reg int_pixel_x = fs_reg(this, glsl_type::uint_type);
   fs_reg int_pixel_y = fs_reg(this, glsl_type::uint_type);
   int_pixel_x.type = BRW_REGISTER_TYPE_UW;
   int_pixel_y.type = BRW_REGISTER_TYPE_UW;
   emit(BRW_OPCODE_ADD,
	int_pixel_x,
	fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
	fs_reg(brw_imm_v(0x10101010)));
   emit(BRW_OPCODE_ADD,
	int_pixel_y,
	fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
	fs_reg(brw_imm_v(0x11001100)));

   /* As of gen6, we can no longer mix float and int sources.  We have
    * to turn the integer pixel centers into floats for their actual
    * use.
    */
   this->pixel_x = fs_reg(this, glsl_type::float_type);
   this->pixel_y = fs_reg(this, glsl_type::float_type);
   emit(BRW_OPCODE_MOV, this->pixel_x, int_pixel_x);
   emit(BRW_OPCODE_MOV, this->pixel_y, int_pixel_y);

   /* On gen6 the payload already delivers W and the barycentric deltas,
    * so only 1/W needs to be computed.
    */
   this->current_annotation = "compute 1/pos.w";
   this->wpos_w = fs_reg(brw_vec8_grf(c->source_w_reg, 0));
   this->pixel_w = fs_reg(this, glsl_type::float_type);
   emit_math(FS_OPCODE_RCP, this->pixel_w, wpos_w);

   this->delta_x = fs_reg(brw_vec8_grf(2, 0));
   this->delta_y = fs_reg(brw_vec8_grf(3, 0));

   this->current_annotation = NULL;
}
2029
2030void
2031fs_visitor::emit_fb_writes()
2032{
2033   this->current_annotation = "FB write header";
2034   GLboolean header_present = GL_TRUE;
2035   int nr = 0;
2036
2037   if (intel->gen >= 6 &&
2038       !this->kill_emitted &&
2039       c->key.nr_color_regions == 1) {
2040      header_present = false;
2041   }
2042
2043   if (header_present) {
2044      /* m0, m1 header */
2045      nr += 2;
2046   }
2047
2048   if (c->aa_dest_stencil_reg) {
2049      emit(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
2050	   fs_reg(brw_vec8_grf(c->aa_dest_stencil_reg, 0)));
2051   }
2052
2053   /* Reserve space for color. It'll be filled in per MRT below. */
2054   int color_mrf = nr;
2055   nr += 4;
2056
2057   if (c->source_depth_to_render_target) {
2058      if (c->computes_depth) {
2059	 /* Hand over gl_FragDepth. */
2060	 assert(this->frag_depth);
2061	 fs_reg depth = *(variable_storage(this->frag_depth));
2062
2063	 emit(BRW_OPCODE_MOV, fs_reg(MRF, nr++), depth);
2064      } else {
2065	 /* Pass through the payload depth. */
2066	 emit(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
2067	      fs_reg(brw_vec8_grf(c->source_depth_reg, 0)));
2068      }
2069   }
2070
2071   if (c->dest_depth_reg) {
2072      emit(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
2073	   fs_reg(brw_vec8_grf(c->dest_depth_reg, 0)));
2074   }
2075
2076   fs_reg color = reg_undef;
2077   if (this->frag_color)
2078      color = *(variable_storage(this->frag_color));
2079   else if (this->frag_data) {
2080      color = *(variable_storage(this->frag_data));
2081      color.type = BRW_REGISTER_TYPE_F;
2082   }
2083
2084   for (int target = 0; target < c->key.nr_color_regions; target++) {
2085      this->current_annotation = ralloc_asprintf(this->mem_ctx,
2086						 "FB write target %d",
2087						 target);
2088      if (this->frag_color || this->frag_data) {
2089	 for (int i = 0; i < 4; i++) {
2090	    emit(BRW_OPCODE_MOV, fs_reg(MRF, color_mrf + i), color);
2091	    color.reg_offset++;
2092	 }
2093      }
2094
2095      if (this->frag_color)
2096	 color.reg_offset -= 4;
2097
2098      fs_inst *inst = emit(FS_OPCODE_FB_WRITE);
2099      inst->target = target;
2100      inst->base_mrf = 0;
2101      inst->mlen = nr;
2102      if (target == c->key.nr_color_regions - 1)
2103	 inst->eot = true;
2104      inst->header_present = header_present;
2105   }
2106
2107   if (c->key.nr_color_regions == 0) {
2108      if (c->key.alpha_test && (this->frag_color || this->frag_data)) {
2109	 /* If the alpha test is enabled but there's no color buffer,
2110	  * we still need to send alpha out the pipeline to our null
2111	  * renderbuffer.
2112	  */
2113	 color.reg_offset += 3;
2114	 emit(BRW_OPCODE_MOV, fs_reg(MRF, color_mrf + 3), color);
2115      }
2116
2117      fs_inst *inst = emit(FS_OPCODE_FB_WRITE);
2118      inst->base_mrf = 0;
2119      inst->mlen = nr;
2120      inst->eot = true;
2121      inst->header_present = header_present;
2122   }
2123
2124   this->current_annotation = NULL;
2125}
2126
/**
 * Generate the SEND for a framebuffer write, including the optional
 * two-register message header (g0 is moved implicitly by the send,
 * g1 is copied by hand).
 */
void
fs_visitor::generate_fb_write(fs_inst *inst)
{
   GLboolean eot = inst->eot;
   struct brw_reg implied_header;

   /* Header is 2 regs, g0 and g1 are the contents. g0 will be implied
    * move, here's g1.
    */
   brw_push_insn_state(p);
   brw_set_mask_control(p, BRW_MASK_DISABLE);
   brw_set_compression_control(p, BRW_COMPRESSION_NONE);

   if (inst->header_present) {
      if (intel->gen >= 6) {
	 /* On gen6+ copy g0 into the header MRF by hand rather than
	  * relying on an implied move from g0.
	  */
	 brw_MOV(p,
		 brw_message_reg(inst->base_mrf),
		 brw_vec8_grf(0, 0));

	 if (inst->target > 0) {
	    /* Set the render target index for choosing BLEND_STATE. */
	    brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, 0, 2),
			      BRW_REGISTER_TYPE_UD),
		    brw_imm_ud(inst->target));
	 }

	 /* Clear viewport index, render target array index. */
	 brw_AND(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, 0, 0),
			   BRW_REGISTER_TYPE_UD),
		 retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD),
		 brw_imm_ud(0xf7ff));

	 implied_header = brw_null_reg();
      } else {
	 /* Pre-gen6 the send's implied move supplies g0 as the header. */
	 implied_header = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
      }

      brw_MOV(p,
	      brw_message_reg(inst->base_mrf + 1),
	      brw_vec8_grf(1, 0));
   } else {
      implied_header = brw_null_reg();
   }

   brw_pop_insn_state(p);

   brw_fb_WRITE(p,
		8, /* dispatch_width */
		retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW),
		inst->base_mrf,
		implied_header,
		inst->target,
		inst->mlen,
		0,
		eot,
		inst->header_present);
}
2184
2185void
2186fs_visitor::generate_linterp(fs_inst *inst,
2187			     struct brw_reg dst, struct brw_reg *src)
2188{
2189   struct brw_reg delta_x = src[0];
2190   struct brw_reg delta_y = src[1];
2191   struct brw_reg interp = src[2];
2192
2193   if (brw->has_pln &&
2194       delta_y.nr == delta_x.nr + 1 &&
2195       (intel->gen >= 6 || (delta_x.nr & 1) == 0)) {
2196      brw_PLN(p, dst, interp, delta_x);
2197   } else {
2198      brw_LINE(p, brw_null_reg(), interp, delta_x);
2199      brw_MAC(p, dst, suboffset(interp, 1), delta_y);
2200   }
2201}
2202
2203void
2204fs_visitor::generate_math(fs_inst *inst,
2205			  struct brw_reg dst, struct brw_reg *src)
2206{
2207   int op;
2208
2209   switch (inst->opcode) {
2210   case FS_OPCODE_RCP:
2211      op = BRW_MATH_FUNCTION_INV;
2212      break;
2213   case FS_OPCODE_RSQ:
2214      op = BRW_MATH_FUNCTION_RSQ;
2215      break;
2216   case FS_OPCODE_SQRT:
2217      op = BRW_MATH_FUNCTION_SQRT;
2218      break;
2219   case FS_OPCODE_EXP2:
2220      op = BRW_MATH_FUNCTION_EXP;
2221      break;
2222   case FS_OPCODE_LOG2:
2223      op = BRW_MATH_FUNCTION_LOG;
2224      break;
2225   case FS_OPCODE_POW:
2226      op = BRW_MATH_FUNCTION_POW;
2227      break;
2228   case FS_OPCODE_SIN:
2229      op = BRW_MATH_FUNCTION_SIN;
2230      break;
2231   case FS_OPCODE_COS:
2232      op = BRW_MATH_FUNCTION_COS;
2233      break;
2234   default:
2235      assert(!"not reached: unknown math function");
2236      op = 0;
2237      break;
2238   }
2239
2240   if (intel->gen >= 6) {
2241      assert(inst->mlen == 0);
2242
2243      if (inst->opcode == FS_OPCODE_POW) {
2244	 brw_math2(p, dst, op, src[0], src[1]);
2245      } else {
2246	 brw_math(p, dst,
2247		  op,
2248		  inst->saturate ? BRW_MATH_SATURATE_SATURATE :
2249		  BRW_MATH_SATURATE_NONE,
2250		  0, src[0],
2251		  BRW_MATH_DATA_VECTOR,
2252		  BRW_MATH_PRECISION_FULL);
2253      }
2254   } else {
2255      assert(inst->mlen >= 1);
2256
2257      brw_math(p, dst,
2258	       op,
2259	       inst->saturate ? BRW_MATH_SATURATE_SATURATE :
2260	       BRW_MATH_SATURATE_NONE,
2261	       inst->base_mrf, src[0],
2262	       BRW_MATH_DATA_VECTOR,
2263	       BRW_MATH_PRECISION_FULL);
2264   }
2265}
2266
/**
 * Generate the SAMPLE message for a texturing instruction.
 *
 * Chooses the hardware message type (and SIMD mode) from the IR opcode
 * and shadow-compare flag, then emits the send to the sampler.
 */
void
fs_visitor::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
{
   int msg_type = -1;
   int rlen = 4;
   uint32_t simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;

   if (intel->gen >= 5) {
      /* Gen5+ has dedicated message types that encode shadow compare
       * and bias/LOD directly, all dispatched SIMD8 here.
       */
      switch (inst->opcode) {
      case FS_OPCODE_TEX:
	 if (inst->shadow_compare) {
	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE;
	 } else {
	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE;
	 }
	 break;
      case FS_OPCODE_TXB:
	 if (inst->shadow_compare) {
	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE;
	 } else {
	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS;
	 }
	 break;
      case FS_OPCODE_TXL:
	 if (inst->shadow_compare) {
	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE;
	 } else {
	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD;
	 }
	 break;
      case FS_OPCODE_TXD:
	 assert(!"TXD isn't supported on gen5+ yet.");
	 break;
      }
   } else {
      switch (inst->opcode) {
      case FS_OPCODE_TEX:
	 /* Note that G45 and older determines shadow compare and dispatch width
	  * from message length for most messages.
	  */
	 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
	 if (inst->shadow_compare) {
	    assert(inst->mlen == 6);
	 } else {
	    assert(inst->mlen <= 4);
	 }
	 break;
      case FS_OPCODE_TXB:
	 if (inst->shadow_compare) {
	    assert(inst->mlen == 6);
	    msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_COMPARE;
	 } else {
	    /* No SIMD8 bias variant pre-gen5, so dispatch SIMD16. */
	    assert(inst->mlen == 9);
	    msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
	    simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
	 }
	 break;
      case FS_OPCODE_TXL:
	 if (inst->shadow_compare) {
	    assert(inst->mlen == 6);
	    msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE;
	 } else {
	    /* No SIMD8 LOD variant pre-gen5, so dispatch SIMD16. */
	    assert(inst->mlen == 9);
	    msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD;
	    simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
	 }
	 break;
      case FS_OPCODE_TXD:
	 assert(!"TXD isn't supported on gen4 yet.");
	 break;
      }
   }
   assert(msg_type != -1);

   /* A SIMD16 response occupies twice as many destination registers. */
   if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) {
      rlen = 8;
      dst = vec16(dst);
   }

   brw_SAMPLE(p,
	      retype(dst, BRW_REGISTER_TYPE_UW),
	      inst->base_mrf,
	      src,
              SURF_INDEX_TEXTURE(inst->sampler),
	      inst->sampler,
	      WRITEMASK_XYZW,
	      msg_type,
	      rlen,
	      inst->mlen,
	      0,
	      1,
	      simd_mode);
}
2360
2361
2362/* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
2363 * looking like:
2364 *
2365 * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
2366 *
2367 * and we're trying to produce:
2368 *
2369 *           DDX                     DDY
2370 * dst: (ss0.tr - ss0.tl)     (ss0.tl - ss0.bl)
2371 *      (ss0.tr - ss0.tl)     (ss0.tr - ss0.br)
2372 *      (ss0.br - ss0.bl)     (ss0.tl - ss0.bl)
2373 *      (ss0.br - ss0.bl)     (ss0.tr - ss0.br)
2374 *      (ss1.tr - ss1.tl)     (ss1.tl - ss1.bl)
2375 *      (ss1.tr - ss1.tl)     (ss1.tr - ss1.br)
2376 *      (ss1.br - ss1.bl)     (ss1.tl - ss1.bl)
2377 *      (ss1.br - ss1.bl)     (ss1.tr - ss1.br)
2378 *
2379 * and add another set of two more subspans if in 16-pixel dispatch mode.
2380 *
2381 * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
2382 * for each pair, and vertstride = 2 jumps us 2 elements after processing a
2383 * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
2384 * between each other.  We could probably do it like ddx and swizzle the right
2385 * order later, but bail for now and just produce
2386 * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
2387 */
2388void
2389fs_visitor::generate_ddx(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
2390{
2391   struct brw_reg src0 = brw_reg(src.file, src.nr, 1,
2392				 BRW_REGISTER_TYPE_F,
2393				 BRW_VERTICAL_STRIDE_2,
2394				 BRW_WIDTH_2,
2395				 BRW_HORIZONTAL_STRIDE_0,
2396				 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
2397   struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
2398				 BRW_REGISTER_TYPE_F,
2399				 BRW_VERTICAL_STRIDE_2,
2400				 BRW_WIDTH_2,
2401				 BRW_HORIZONTAL_STRIDE_0,
2402				 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
2403   brw_ADD(p, dst, src0, negate(src1));
2404}
2405
2406void
2407fs_visitor::generate_ddy(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
2408{
2409   struct brw_reg src0 = brw_reg(src.file, src.nr, 0,
2410				 BRW_REGISTER_TYPE_F,
2411				 BRW_VERTICAL_STRIDE_4,
2412				 BRW_WIDTH_4,
2413				 BRW_HORIZONTAL_STRIDE_0,
2414				 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
2415   struct brw_reg src1 = brw_reg(src.file, src.nr, 2,
2416				 BRW_REGISTER_TYPE_F,
2417				 BRW_VERTICAL_STRIDE_4,
2418				 BRW_WIDTH_4,
2419				 BRW_HORIZONTAL_STRIDE_0,
2420				 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
2421   brw_ADD(p, dst, src0, negate(src1));
2422}
2423
/**
 * Initialize the discard mask register that generate_discard_and()
 * later folds into the pixel enables.
 */
void
fs_visitor::generate_discard_not(fs_inst *inst, struct brw_reg mask)
{
   if (intel->gen >= 6) {
      /* Gen6 no longer has the mask reg for us to just read the
       * active channels from.  However, cmp updates just the channels
       * of the flag reg that are enabled, so we can get at the
       * channel enables that way.  In this step, make a reg of ones
       * we'll compare to.
       */
      brw_MOV(p, mask, brw_imm_ud(1));
   } else {
      /* Pre-gen6: start the mask off as the inverted IMASK so active,
       * undiscarded channels stay set.
       */
      brw_push_insn_state(p);
      brw_set_mask_control(p, BRW_MASK_DISABLE);
      brw_NOT(p, mask, brw_mask_reg(1)); /* IMASK */
      brw_pop_insn_state(p);
   }
}
2442
/**
 * AND the accumulated discard mask into the payload's pixel enables so
 * discarded pixels are dropped from subsequent framebuffer writes.
 */
void
fs_visitor::generate_discard_and(fs_inst *inst, struct brw_reg mask)
{
   if (intel->gen >= 6) {
      struct brw_reg f0 = brw_flag_reg();
      struct brw_reg g1 = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);

      brw_push_insn_state(p);
      brw_set_mask_control(p, BRW_MASK_DISABLE);
      brw_MOV(p, f0, brw_imm_uw(0xffff)); /* inactive channels undiscarded */
      brw_pop_insn_state(p);

      /* Flag channels whose mask value is zero (discarded); only the
       * currently-active channels update the flag register.
       */
      brw_CMP(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UD),
	      BRW_CONDITIONAL_Z, mask, brw_imm_ud(0)); /* active channels fail test */
      /* Undo CMP's whacking of predication*/
      brw_set_predicate_control(p, BRW_PREDICATE_NONE);

      brw_push_insn_state(p);
      brw_set_mask_control(p, BRW_MASK_DISABLE);
      /* Fold the per-channel flag results into the pixel enables in g1. */
      brw_AND(p, g1, f0, g1);
      brw_pop_insn_state(p);
   } else {
      /* Pre-gen6 the pixel enables live in g0; AND the mask in there. */
      struct brw_reg g0 = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);

      mask = brw_uw1_reg(mask.file, mask.nr, 0);

      brw_push_insn_state(p);
      brw_set_mask_control(p, BRW_MASK_DISABLE);
      brw_AND(p, g0, mask, g0);
      brw_pop_insn_state(p);
   }
}
2475
2476void
2477fs_visitor::generate_spill(fs_inst *inst, struct brw_reg src)
2478{
2479   assert(inst->mlen != 0);
2480
2481   brw_MOV(p,
2482	   retype(brw_message_reg(inst->base_mrf + 1), BRW_REGISTER_TYPE_UD),
2483	   retype(src, BRW_REGISTER_TYPE_UD));
2484   brw_oword_block_write_scratch(p, brw_message_reg(inst->base_mrf), 1,
2485				 inst->offset);
2486}
2487
/**
 * Read a previously spilled register back from scratch space into
 * \p dst, working around gen4 send-destination errata.
 */
void
fs_visitor::generate_unspill(fs_inst *inst, struct brw_reg dst)
{
   assert(inst->mlen != 0);

   /* Clear any post destination dependencies that would be ignored by
    * the block read.  See the B-Spec for pre-gen5 send instruction.
    *
    * This could use a better solution, since texture sampling and
    * math reads could potentially run into it as well -- anywhere
    * that we have a SEND with a destination that is a register that
    * was written but not read within the last N instructions (what's
    * N?  unsure).  This is rare because of dead code elimination, but
    * not impossible.
    */
   if (intel->gen == 4 && !intel->is_g4x)
      brw_MOV(p, brw_null_reg(), dst);

   brw_oword_block_read_scratch(p, dst, brw_message_reg(inst->base_mrf), 1,
				inst->offset);

   if (intel->gen == 4 && !intel->is_g4x) {
      /* gen4 errata: destination from a send can't be used as a
       * destination until it's been read.  Just read it so we don't
       * have to worry.
       */
      brw_MOV(p, brw_null_reg(), dst);
   }
}
2517
2518
/**
 * Load a uniform demoted by setup_pull_constants() from the constant
 * buffer surface into \p dst, with the same gen4 send-destination
 * errata workarounds as generate_unspill().
 */
void
fs_visitor::generate_pull_constant_load(fs_inst *inst, struct brw_reg dst)
{
   assert(inst->mlen != 0);

   /* Clear any post destination dependencies that would be ignored by
    * the block read.  See the B-Spec for pre-gen5 send instruction.
    *
    * This could use a better solution, since texture sampling and
    * math reads could potentially run into it as well -- anywhere
    * that we have a SEND with a destination that is a register that
    * was written but not read within the last N instructions (what's
    * N?  unsure).  This is rare because of dead code elimination, but
    * not impossible.
    */
   if (intel->gen == 4 && !intel->is_g4x)
      brw_MOV(p, brw_null_reg(), dst);

   brw_oword_block_read(p, dst, brw_message_reg(inst->base_mrf),
			inst->offset, SURF_INDEX_FRAG_CONST_BUFFER);

   if (intel->gen == 4 && !intel->is_g4x) {
      /* gen4 errata: destination from a send can't be used as a
       * destination until it's been read.  Just read it so we don't
       * have to worry.
       */
      brw_MOV(p, brw_null_reg(), dst);
   }
}
2548
2549/**
2550 * To be called after the last _mesa_add_state_reference() call, to
2551 * set up prog_data.param[] for assign_curb_setup() and
2552 * setup_pull_constants().
2553 */
2554void
2555fs_visitor::setup_paramvalues_refs()
2556{
2557   /* Set up the pointers to ParamValues now that that array is finalized. */
2558   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
2559      c->prog_data.param[i] =
2560	 fp->Base.Parameters->ParameterValues[this->param_index[i]] +
2561	 this->param_offset[i];
2562   }
2563}
2564
2565void
2566fs_visitor::assign_curb_setup()
2567{
2568   c->prog_data.first_curbe_grf = c->nr_payload_regs;
2569   c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
2570
2571   /* Map the offsets in the UNIFORM file to fixed HW regs. */
2572   foreach_iter(exec_list_iterator, iter, this->instructions) {
2573      fs_inst *inst = (fs_inst *)iter.get();
2574
2575      for (unsigned int i = 0; i < 3; i++) {
2576	 if (inst->src[i].file == UNIFORM) {
2577	    int constant_nr = inst->src[i].hw_reg + inst->src[i].reg_offset;
2578	    struct brw_reg brw_reg = brw_vec1_grf(c->prog_data.first_curbe_grf +
2579						  constant_nr / 8,
2580						  constant_nr % 8);
2581
2582	    inst->src[i].file = FIXED_HW_REG;
2583	    inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
2584	 }
2585      }
2586   }
2587}
2588
/**
 * Decide which SF-produced setup slot each fragment input attribute
 * lands in, filling urb_setup[] and prog_data.urb_read_length.
 */
void
fs_visitor::calculate_urb_setup()
{
   for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
      urb_setup[i] = -1;
   }

   int urb_next = 0;
   /* Figure out where each of the incoming setup attributes lands. */
   if (intel->gen >= 6) {
      for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
	 if (brw->fragment_program->Base.InputsRead & BITFIELD64_BIT(i)) {
	    urb_setup[i] = urb_next++;
	 }
      }
   } else {
      /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
      for (unsigned int i = 0; i < VERT_RESULT_MAX; i++) {
	 if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
	    int fp_index;

	    /* Map each written vertex result onto the fragment attribute
	     * of the same meaning; other results have no FS equivalent.
	     */
	    if (i >= VERT_RESULT_VAR0)
	       fp_index = i - (VERT_RESULT_VAR0 - FRAG_ATTRIB_VAR0);
	    else if (i <= VERT_RESULT_TEX7)
	       fp_index = i;
	    else
	       fp_index = -1;

	    if (fp_index >= 0)
	       urb_setup[fp_index] = urb_next++;
	 }
      }
   }

   /* Each attribute is 4 setup channels, each of which is half a reg. */
   c->prog_data.urb_read_length = urb_next * 2;
}
2626
2627void
2628fs_visitor::assign_urb_setup()
2629{
2630   int urb_start = c->prog_data.first_curbe_grf + c->prog_data.curb_read_length;
2631
2632   /* Offset all the urb_setup[] index by the actual position of the
2633    * setup regs, now that the location of the constants has been chosen.
2634    */
2635   foreach_iter(exec_list_iterator, iter, this->instructions) {
2636      fs_inst *inst = (fs_inst *)iter.get();
2637
2638      if (inst->opcode == FS_OPCODE_LINTERP) {
2639	 assert(inst->src[2].file == FIXED_HW_REG);
2640	 inst->src[2].fixed_hw_reg.nr += urb_start;
2641      }
2642
2643      if (inst->opcode == FS_OPCODE_CINTERP) {
2644	 assert(inst->src[0].file == FIXED_HW_REG);
2645	 inst->src[0].fixed_hw_reg.nr += urb_start;
2646      }
2647   }
2648
2649   this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
2650}
2651
2652/**
2653 * Split large virtual GRFs into separate components if we can.
2654 *
2655 * This is mostly duplicated with what brw_fs_vector_splitting does,
2656 * but that's really conservative because it's afraid of doing
2657 * splitting that doesn't result in real progress after the rest of
2658 * the optimization phases, which would cause infinite looping in
2659 * optimization.  We can do it once here, safely.  This also has the
2660 * opportunity to split interpolated values, or maybe even uniforms,
2661 * which we don't have at the IR level.
2662 *
2663 * We want to split, because virtual GRFs are what we register
2664 * allocate and spill (due to contiguousness requirements for some
2665 * instructions), and they're what we naturally generate in the
2666 * codegen process, but most virtual GRFs don't actually need to be
2667 * contiguous sets of GRFs.  If we split, we'll end up with reduced
2668 * live intervals and better dead code elimination and coalescing.
2669 */
2670void
2671fs_visitor::split_virtual_grfs()
2672{
2673   int num_vars = this->virtual_grf_next;
2674   bool split_grf[num_vars];
2675   int new_virtual_grf[num_vars];
2676
2677   /* Try to split anything > 0 sized. */
2678   for (int i = 0; i < num_vars; i++) {
2679      if (this->virtual_grf_sizes[i] != 1)
2680	 split_grf[i] = true;
2681      else
2682	 split_grf[i] = false;
2683   }
2684
2685   if (brw->has_pln) {
2686      /* PLN opcodes rely on the delta_xy being contiguous. */
2687      split_grf[this->delta_x.reg] = false;
2688   }
2689
2690   foreach_iter(exec_list_iterator, iter, this->instructions) {
2691      fs_inst *inst = (fs_inst *)iter.get();
2692
2693      /* Texturing produces 4 contiguous registers, so no splitting. */
2694      if (inst->is_tex()) {
2695	 split_grf[inst->dst.reg] = false;
2696      }
2697   }
2698
2699   /* Allocate new space for split regs.  Note that the virtual
2700    * numbers will be contiguous.
2701    */
2702   for (int i = 0; i < num_vars; i++) {
2703      if (split_grf[i]) {
2704	 new_virtual_grf[i] = virtual_grf_alloc(1);
2705	 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
2706	    int reg = virtual_grf_alloc(1);
2707	    assert(reg == new_virtual_grf[i] + j - 1);
2708	    (void) reg;
2709	 }
2710	 this->virtual_grf_sizes[i] = 1;
2711      }
2712   }
2713
2714   foreach_iter(exec_list_iterator, iter, this->instructions) {
2715      fs_inst *inst = (fs_inst *)iter.get();
2716
2717      if (inst->dst.file == GRF &&
2718	  split_grf[inst->dst.reg] &&
2719	  inst->dst.reg_offset != 0) {
2720	 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
2721			  inst->dst.reg_offset - 1);
2722	 inst->dst.reg_offset = 0;
2723      }
2724      for (int i = 0; i < 3; i++) {
2725	 if (inst->src[i].file == GRF &&
2726	     split_grf[inst->src[i].reg] &&
2727	     inst->src[i].reg_offset != 0) {
2728	    inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
2729				inst->src[i].reg_offset - 1);
2730	    inst->src[i].reg_offset = 0;
2731	 }
2732      }
2733   }
2734   this->live_intervals_valid = false;
2735}
2736
2737/**
2738 * Choose accesses from the UNIFORM file to demote to using the pull
2739 * constant buffer.
2740 *
2741 * We allow a fragment shader to have more than the specified minimum
2742 * maximum number of fragment shader uniform components (64).  If
2743 * there are too many of these, they'd fill up all of register space.
2744 * So, this will push some of them out to the pull constant buffer and
2745 * update the program to load them.
2746 */
2747void
2748fs_visitor::setup_pull_constants()
2749{
2750   /* Only allow 16 registers (128 uniform components) as push constants. */
2751   unsigned int max_uniform_components = 16 * 8;
2752   if (c->prog_data.nr_params <= max_uniform_components)
2753      return;
2754
2755   /* Just demote the end of the list.  We could probably do better
2756    * here, demoting things that are rarely used in the program first.
2757    */
2758   int pull_uniform_base = max_uniform_components;
2759   int pull_uniform_count = c->prog_data.nr_params - pull_uniform_base;
2760
2761   foreach_iter(exec_list_iterator, iter, this->instructions) {
2762      fs_inst *inst = (fs_inst *)iter.get();
2763
2764      for (int i = 0; i < 3; i++) {
2765	 if (inst->src[i].file != UNIFORM)
2766	    continue;
2767
2768	 int uniform_nr = inst->src[i].hw_reg + inst->src[i].reg_offset;
2769	 if (uniform_nr < pull_uniform_base)
2770	    continue;
2771
2772	 fs_reg dst = fs_reg(this, glsl_type::float_type);
2773	 fs_inst *pull = new(mem_ctx) fs_inst(FS_OPCODE_PULL_CONSTANT_LOAD,
2774					      dst);
2775	 pull->offset = ((uniform_nr - pull_uniform_base) * 4) & ~15;
2776	 pull->ir = inst->ir;
2777	 pull->annotation = inst->annotation;
2778	 pull->base_mrf = 14;
2779	 pull->mlen = 1;
2780
2781	 inst->insert_before(pull);
2782
2783	 inst->src[i].file = GRF;
2784	 inst->src[i].reg = dst.reg;
2785	 inst->src[i].reg_offset = 0;
2786	 inst->src[i].smear = (uniform_nr - pull_uniform_base) & 3;
2787      }
2788   }
2789
2790   for (int i = 0; i < pull_uniform_count; i++) {
2791      c->prog_data.pull_param[i] = c->prog_data.param[pull_uniform_base + i];
2792      c->prog_data.pull_param_convert[i] =
2793	 c->prog_data.param_convert[pull_uniform_base + i];
2794   }
2795   c->prog_data.nr_params -= pull_uniform_count;
2796   c->prog_data.nr_pull_params = pull_uniform_count;
2797}
2798
2799void
2800fs_visitor::calculate_live_intervals()
2801{
2802   int num_vars = this->virtual_grf_next;
2803   int *def = ralloc_array(mem_ctx, int, num_vars);
2804   int *use = ralloc_array(mem_ctx, int, num_vars);
2805   int loop_depth = 0;
2806   int loop_start = 0;
2807   int bb_header_ip = 0;
2808
2809   if (this->live_intervals_valid)
2810      return;
2811
2812   for (int i = 0; i < num_vars; i++) {
2813      def[i] = MAX_INSTRUCTION;
2814      use[i] = -1;
2815   }
2816
2817   int ip = 0;
2818   foreach_iter(exec_list_iterator, iter, this->instructions) {
2819      fs_inst *inst = (fs_inst *)iter.get();
2820
2821      if (inst->opcode == BRW_OPCODE_DO) {
2822	 if (loop_depth++ == 0)
2823	    loop_start = ip;
2824      } else if (inst->opcode == BRW_OPCODE_WHILE) {
2825	 loop_depth--;
2826
2827	 if (loop_depth == 0) {
2828	    /* Patches up the use of vars marked for being live across
2829	     * the whole loop.
2830	     */
2831	    for (int i = 0; i < num_vars; i++) {
2832	       if (use[i] == loop_start) {
2833		  use[i] = ip;
2834	       }
2835	    }
2836	 }
2837      } else {
2838	 for (unsigned int i = 0; i < 3; i++) {
2839	    if (inst->src[i].file == GRF && inst->src[i].reg != 0) {
2840	       int reg = inst->src[i].reg;
2841
2842	       if (!loop_depth || (this->virtual_grf_sizes[reg] == 1 &&
2843				   def[reg] >= bb_header_ip)) {
2844		  use[reg] = ip;
2845	       } else {
2846		  def[reg] = MIN2(loop_start, def[reg]);
2847		  use[reg] = loop_start;
2848
2849		  /* Nobody else is going to go smash our start to
2850		   * later in the loop now, because def[reg] now
2851		   * points before the bb header.
2852		   */
2853	       }
2854	    }
2855	 }
2856	 if (inst->dst.file == GRF && inst->dst.reg != 0) {
2857	    int reg = inst->dst.reg;
2858
2859	    if (!loop_depth || (this->virtual_grf_sizes[reg] == 1 &&
2860				!inst->predicated)) {
2861	       def[reg] = MIN2(def[reg], ip);
2862	    } else {
2863	       def[reg] = MIN2(def[reg], loop_start);
2864	    }
2865	 }
2866      }
2867
2868      ip++;
2869
2870      /* Set the basic block header IP.  This is used for determining
2871       * if a complete def of single-register virtual GRF in a loop
2872       * dominates a use in the same basic block.  It's a quick way to
2873       * reduce the live interval range of most register used in a
2874       * loop.
2875       */
2876      if (inst->opcode == BRW_OPCODE_IF ||
2877	  inst->opcode == BRW_OPCODE_ELSE ||
2878	  inst->opcode == BRW_OPCODE_ENDIF ||
2879	  inst->opcode == BRW_OPCODE_DO ||
2880	  inst->opcode == BRW_OPCODE_WHILE ||
2881	  inst->opcode == BRW_OPCODE_BREAK ||
2882	  inst->opcode == BRW_OPCODE_CONTINUE) {
2883	 bb_header_ip = ip;
2884      }
2885   }
2886
2887   ralloc_free(this->virtual_grf_def);
2888   ralloc_free(this->virtual_grf_use);
2889   this->virtual_grf_def = def;
2890   this->virtual_grf_use = use;
2891
2892   this->live_intervals_valid = true;
2893}
2894
2895/**
2896 * Attempts to move immediate constants into the immediate
2897 * constant slot of following instructions.
2898 *
2899 * Immediate constants are a bit tricky -- they have to be in the last
2900 * operand slot, you can't do abs/negate on them,
2901 */
2902
2903bool
2904fs_visitor::propagate_constants()
2905{
2906   bool progress = false;
2907
2908   calculate_live_intervals();
2909
2910   foreach_iter(exec_list_iterator, iter, this->instructions) {
2911      fs_inst *inst = (fs_inst *)iter.get();
2912
2913      if (inst->opcode != BRW_OPCODE_MOV ||
2914	  inst->predicated ||
2915	  inst->dst.file != GRF || inst->src[0].file != IMM ||
2916	  inst->dst.type != inst->src[0].type)
2917	 continue;
2918
2919      /* Don't bother with cases where we should have had the
2920       * operation on the constant folded in GLSL already.
2921       */
2922      if (inst->saturate)
2923	 continue;
2924
2925      /* Found a move of a constant to a GRF.  Find anything else using the GRF
2926       * before it's written, and replace it with the constant if we can.
2927       */
2928      exec_list_iterator scan_iter = iter;
2929      scan_iter.next();
2930      for (; scan_iter.has_next(); scan_iter.next()) {
2931	 fs_inst *scan_inst = (fs_inst *)scan_iter.get();
2932
2933	 if (scan_inst->opcode == BRW_OPCODE_DO ||
2934	     scan_inst->opcode == BRW_OPCODE_WHILE ||
2935	     scan_inst->opcode == BRW_OPCODE_ELSE ||
2936	     scan_inst->opcode == BRW_OPCODE_ENDIF) {
2937	    break;
2938	 }
2939
2940	 for (int i = 2; i >= 0; i--) {
2941	    if (scan_inst->src[i].file != GRF ||
2942		scan_inst->src[i].reg != inst->dst.reg ||
2943		scan_inst->src[i].reg_offset != inst->dst.reg_offset)
2944	       continue;
2945
2946	    /* Don't bother with cases where we should have had the
2947	     * operation on the constant folded in GLSL already.
2948	     */
2949	    if (scan_inst->src[i].negate || scan_inst->src[i].abs)
2950	       continue;
2951
2952	    switch (scan_inst->opcode) {
2953	    case BRW_OPCODE_MOV:
2954	       scan_inst->src[i] = inst->src[0];
2955	       progress = true;
2956	       break;
2957
2958	    case BRW_OPCODE_MUL:
2959	    case BRW_OPCODE_ADD:
2960	       if (i == 1) {
2961		  scan_inst->src[i] = inst->src[0];
2962		  progress = true;
2963	       } else if (i == 0 && scan_inst->src[1].file != IMM) {
2964		  /* Fit this constant in by commuting the operands */
2965		  scan_inst->src[0] = scan_inst->src[1];
2966		  scan_inst->src[1] = inst->src[0];
2967		  progress = true;
2968	       }
2969	       break;
2970	    case BRW_OPCODE_CMP:
2971	    case BRW_OPCODE_SEL:
2972	       if (i == 1) {
2973		  scan_inst->src[i] = inst->src[0];
2974		  progress = true;
2975	       }
2976	    }
2977	 }
2978
2979	 if (scan_inst->dst.file == GRF &&
2980	     scan_inst->dst.reg == inst->dst.reg &&
2981	     (scan_inst->dst.reg_offset == inst->dst.reg_offset ||
2982	      scan_inst->is_tex())) {
2983	    break;
2984	 }
2985      }
2986   }
2987
2988   if (progress)
2989       this->live_intervals_valid = false;
2990
2991   return progress;
2992}
2993/**
2994 * Must be called after calculate_live_intervales() to remove unused
2995 * writes to registers -- register allocation will fail otherwise
2996 * because something deffed but not used won't be considered to
2997 * interfere with other regs.
2998 */
2999bool
3000fs_visitor::dead_code_eliminate()
3001{
3002   bool progress = false;
3003   int pc = 0;
3004
3005   calculate_live_intervals();
3006
3007   foreach_iter(exec_list_iterator, iter, this->instructions) {
3008      fs_inst *inst = (fs_inst *)iter.get();
3009
3010      if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
3011	 inst->remove();
3012	 progress = true;
3013      }
3014
3015      pc++;
3016   }
3017
3018   if (progress)
3019      live_intervals_valid = false;
3020
3021   return progress;
3022}
3023
/**
 * Remove GRF-to-GRF MOVs by rewriting later uses of the destination to
 * read the source directly, when neither register is rewritten before
 * the end of the program.
 */
bool
fs_visitor::register_coalesce()
{
   bool progress = false;
   int if_depth = 0;
   int loop_depth = 0;

   foreach_iter(exec_list_iterator, iter, this->instructions) {
      fs_inst *inst = (fs_inst *)iter.get();

      /* Make sure that we dominate the instructions we're going to
       * scan for interfering with our coalescing, or we won't have
       * scanned enough to see if anything interferes with our
       * coalescing.  We don't dominate the following instructions if
       * we're in a loop or an if block.
       */
      switch (inst->opcode) {
      case BRW_OPCODE_DO:
	 loop_depth++;
	 break;
      case BRW_OPCODE_WHILE:
	 loop_depth--;
	 break;
      case BRW_OPCODE_IF:
	 if_depth++;
	 break;
      case BRW_OPCODE_ENDIF:
	 if_depth--;
	 break;
      }
      if (loop_depth || if_depth)
	 continue;

      /* Only coalesce unpredicated, unsaturated GRF-to-GRF MOVs of
       * matching type.
       */
      if (inst->opcode != BRW_OPCODE_MOV ||
	  inst->predicated ||
	  inst->saturate ||
	  inst->dst.file != GRF || inst->src[0].file != GRF ||
	  inst->dst.type != inst->src[0].type)
	 continue;

      bool has_source_modifiers = inst->src[0].abs || inst->src[0].negate;

      /* Found a move of a GRF to a GRF.  Let's see if we can coalesce
       * them: check for no writes to either one until the exit of the
       * program.
       */
      bool interfered = false;
      exec_list_iterator scan_iter = iter;
      scan_iter.next();
      for (; scan_iter.has_next(); scan_iter.next()) {
	 fs_inst *scan_inst = (fs_inst *)scan_iter.get();

	 if (scan_inst->dst.file == GRF) {
	    /* Texturing writes all four reg_offsets of its destination,
	     * so it interferes regardless of offset.
	     */
	    if (scan_inst->dst.reg == inst->dst.reg &&
		(scan_inst->dst.reg_offset == inst->dst.reg_offset ||
		 scan_inst->is_tex())) {
	       interfered = true;
	       break;
	    }
	    if (scan_inst->dst.reg == inst->src[0].reg &&
		(scan_inst->dst.reg_offset == inst->src[0].reg_offset ||
		 scan_inst->is_tex())) {
	       interfered = true;
	       break;
	    }
	 }

	 /* The gen6 MATH instruction can't handle source modifiers, so avoid
	  * coalescing those for now.  We should do something more specific.
	  */
	 if (intel->gen == 6 && scan_inst->is_math() && has_source_modifiers) {
	    interfered = true;
	    break;
	 }
      }
      if (interfered) {
	 continue;
      }

      /* Rewrite the later usage to point at the source of the move to
       * be removed.
       */
      for (exec_list_iterator scan_iter = iter; scan_iter.has_next();
	   scan_iter.next()) {
	 fs_inst *scan_inst = (fs_inst *)scan_iter.get();

	 for (int i = 0; i < 3; i++) {
	    if (scan_inst->src[i].file == GRF &&
		scan_inst->src[i].reg == inst->dst.reg &&
		scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
	       scan_inst->src[i].reg = inst->src[0].reg;
	       scan_inst->src[i].reg_offset = inst->src[0].reg_offset;
	       scan_inst->src[i].abs |= inst->src[0].abs;
	       scan_inst->src[i].negate ^= inst->src[0].negate;
	       scan_inst->src[i].smear = inst->src[0].smear;
	    }
	 }
      }

      inst->remove();
      progress = true;
   }

   if (progress)
      live_intervals_valid = false;

   return progress;
}
3132
3133
bool
fs_visitor::compute_to_mrf()
{
   /* Looks for a MOV of a GRF into an MRF and, when safe, rewrites the
    * instruction that produced the GRF value to write directly into the
    * MRF, removing the MOV.  Returns true on any change.
    */
   bool progress = false;
   int next_ip = 0;

   calculate_live_intervals();

   foreach_iter(exec_list_iterator, iter, this->instructions) {
      fs_inst *inst = (fs_inst *)iter.get();

      int ip = next_ip;
      next_ip++;

      /* Only plain, unmodified, same-typed GRF-to-MRF MOVs qualify;
       * source modifiers or a smear can't be folded into the producer.
       */
      if (inst->opcode != BRW_OPCODE_MOV ||
	  inst->predicated ||
	  inst->dst.file != MRF || inst->src[0].file != GRF ||
	  inst->dst.type != inst->src[0].type ||
	  inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
	 continue;

      /* Can't compute-to-MRF this GRF if someone else was going to
       * read it later.
       */
      if (this->virtual_grf_use[inst->src[0].reg] > ip)
	 continue;

      /* Found a move of a GRF to a MRF.  Let's see if we can go
       * rewrite the thing that made this GRF to write into the MRF.
       */
      fs_inst *scan_inst;
      /* Walk backwards; the prev != NULL test stops at the exec_list
       * head sentinel.
       */
      for (scan_inst = (fs_inst *)inst->prev;
	   scan_inst->prev != NULL;
	   scan_inst = (fs_inst *)scan_inst->prev) {
	 if (scan_inst->dst.file == GRF &&
	     scan_inst->dst.reg == inst->src[0].reg) {
	    /* Found the last thing to write our reg we want to turn
	     * into a compute-to-MRF.
	     */

	    if (scan_inst->is_tex()) {
	       /* texturing writes several continuous regs, so we can't
		* compute-to-mrf that.
		*/
	       break;
	    }

	    /* If it's predicated, it (probably) didn't populate all
	     * the channels.
	     */
	    if (scan_inst->predicated)
	       break;

	    /* SEND instructions can't have MRF as a destination. */
	    if (scan_inst->mlen)
	       break;

	    if (intel->gen >= 6) {
	       /* gen6 math instructions must have the destination be
		* GRF, so no compute-to-MRF for them.
		*/
	       if (scan_inst->is_math()) {
		  break;
	       }
	    }

	    if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
	       /* Found the creator of our MRF's source value. */
	       scan_inst->dst.file = MRF;
	       scan_inst->dst.hw_reg = inst->dst.hw_reg;
	       scan_inst->saturate |= inst->saturate;
	       inst->remove();
	       progress = true;
	    }
	    break;
	 }

	 /* We don't handle flow control here.  Most computation of
	  * values that end up in MRFs are shortly before the MRF
	  * write anyway.
	  */
	 if (scan_inst->opcode == BRW_OPCODE_DO ||
	     scan_inst->opcode == BRW_OPCODE_WHILE ||
	     scan_inst->opcode == BRW_OPCODE_ELSE ||
	     scan_inst->opcode == BRW_OPCODE_ENDIF) {
	    break;
	 }

	 /* You can't read from an MRF, so if someone else reads our
	  * MRF's source GRF that we wanted to rewrite, that stops us.
	  */
	 bool interfered = false;
	 for (int i = 0; i < 3; i++) {
	    if (scan_inst->src[i].file == GRF &&
		scan_inst->src[i].reg == inst->src[0].reg &&
		scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
	       interfered = true;
	    }
	 }
	 if (interfered)
	    break;

	 if (scan_inst->dst.file == MRF &&
	     scan_inst->dst.hw_reg == inst->dst.hw_reg) {
	    /* Somebody else wrote our MRF here, so we can't
	     * compute-to-MRF before that.
	     */
	    break;
	 }

	 if (scan_inst->mlen > 0) {
	    /* Found a SEND instruction, which means that there are
	     * live values in MRFs from base_mrf to base_mrf +
	     * scan_inst->mlen - 1.  Don't go pushing our MRF write up
	     * above it.
	     */
	    if (inst->dst.hw_reg >= scan_inst->base_mrf &&
		inst->dst.hw_reg < scan_inst->base_mrf + scan_inst->mlen) {
	       break;
	    }
	 }
      }
   }

   return progress;
}
3260
3261/**
 * Walks through basic blocks, looking for repeated MRF writes and
3263 * removing the later ones.
3264 */
3265bool
3266fs_visitor::remove_duplicate_mrf_writes()
3267{
3268   fs_inst *last_mrf_move[16];
3269   bool progress = false;
3270
3271   memset(last_mrf_move, 0, sizeof(last_mrf_move));
3272
3273   foreach_iter(exec_list_iterator, iter, this->instructions) {
3274      fs_inst *inst = (fs_inst *)iter.get();
3275
3276      switch (inst->opcode) {
3277      case BRW_OPCODE_DO:
3278      case BRW_OPCODE_WHILE:
3279      case BRW_OPCODE_IF:
3280      case BRW_OPCODE_ELSE:
3281      case BRW_OPCODE_ENDIF:
3282	 memset(last_mrf_move, 0, sizeof(last_mrf_move));
3283	 continue;
3284      default:
3285	 break;
3286      }
3287
3288      if (inst->opcode == BRW_OPCODE_MOV &&
3289	  inst->dst.file == MRF) {
3290	 fs_inst *prev_inst = last_mrf_move[inst->dst.hw_reg];
3291	 if (prev_inst && inst->equals(prev_inst)) {
3292	    inst->remove();
3293	    progress = true;
3294	    continue;
3295	 }
3296      }
3297
3298      /* Clear out the last-write records for MRFs that were overwritten. */
3299      if (inst->dst.file == MRF) {
3300	 last_mrf_move[inst->dst.hw_reg] = NULL;
3301      }
3302
3303      if (inst->mlen > 0) {
3304	 /* Found a SEND instruction, which will include two or fewer
3305	  * implied MRF writes.  We could do better here.
3306	  */
3307	 for (int i = 0; i < implied_mrf_writes(inst); i++) {
3308	    last_mrf_move[inst->base_mrf + i] = NULL;
3309	 }
3310      }
3311
3312      /* Clear out any MRF move records whose sources got overwritten. */
3313      if (inst->dst.file == GRF) {
3314	 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
3315	    if (last_mrf_move[i] &&
3316		last_mrf_move[i]->src[0].reg == inst->dst.reg) {
3317	       last_mrf_move[i] = NULL;
3318	    }
3319	 }
3320      }
3321
3322      if (inst->opcode == BRW_OPCODE_MOV &&
3323	  inst->dst.file == MRF &&
3324	  inst->src[0].file == GRF &&
3325	  !inst->predicated) {
3326	 last_mrf_move[inst->dst.hw_reg] = inst;
3327      }
3328   }
3329
3330   return progress;
3331}
3332
3333bool
3334fs_visitor::virtual_grf_interferes(int a, int b)
3335{
3336   int start = MAX2(this->virtual_grf_def[a], this->virtual_grf_def[b]);
3337   int end = MIN2(this->virtual_grf_use[a], this->virtual_grf_use[b]);
3338
3339   /* We can't handle dead register writes here, without iterating
3340    * over the whole instruction stream to find every single dead
3341    * write to that register to compare to the live interval of the
3342    * other register.  Just assert that dead_code_eliminate() has been
3343    * called.
3344    */
3345   assert((this->virtual_grf_use[a] != -1 ||
3346	   this->virtual_grf_def[a] == MAX_INSTRUCTION) &&
3347	  (this->virtual_grf_use[b] != -1 ||
3348	   this->virtual_grf_def[b] == MAX_INSTRUCTION));
3349
3350   return start < end;
3351}
3352
3353static struct brw_reg brw_reg_from_fs_reg(fs_reg *reg)
3354{
3355   struct brw_reg brw_reg;
3356
3357   switch (reg->file) {
3358   case GRF:
3359   case ARF:
3360   case MRF:
3361      if (reg->smear == -1) {
3362	 brw_reg = brw_vec8_reg(reg->file,
3363				reg->hw_reg, 0);
3364      } else {
3365	 brw_reg = brw_vec1_reg(reg->file,
3366				reg->hw_reg, reg->smear);
3367      }
3368      brw_reg = retype(brw_reg, reg->type);
3369      break;
3370   case IMM:
3371      switch (reg->type) {
3372      case BRW_REGISTER_TYPE_F:
3373	 brw_reg = brw_imm_f(reg->imm.f);
3374	 break;
3375      case BRW_REGISTER_TYPE_D:
3376	 brw_reg = brw_imm_d(reg->imm.i);
3377	 break;
3378      case BRW_REGISTER_TYPE_UD:
3379	 brw_reg = brw_imm_ud(reg->imm.u);
3380	 break;
3381      default:
3382	 assert(!"not reached");
3383	 brw_reg = brw_null_reg();
3384	 break;
3385      }
3386      break;
3387   case FIXED_HW_REG:
3388      brw_reg = reg->fixed_hw_reg;
3389      break;
3390   case BAD_FILE:
3391      /* Probably unused. */
3392      brw_reg = brw_null_reg();
3393      break;
3394   case UNIFORM:
3395      assert(!"not reached");
3396      brw_reg = brw_null_reg();
3397      break;
3398   default:
3399      assert(!"not reached");
3400      brw_reg = brw_null_reg();
3401      break;
3402   }
3403   if (reg->abs)
3404      brw_reg = brw_abs(brw_reg);
3405   if (reg->negate)
3406      brw_reg = negate(brw_reg);
3407
3408   return brw_reg;
3409}
3410
void
fs_visitor::generate_code()
{
   /* Walks the FS IR instruction list and emits native Gen EU code
    * through the brw_eu emitter `p`.  Control-flow instructions are
    * tracked on dynamically-grown if/loop stacks so nested IF/ELSE/ENDIF
    * and DO/WHILE can be patched up when their ends are reached.
    */
   int last_native_inst = 0;
   const char *last_annotation_string = NULL;
   ir_instruction *last_annotation_ir = NULL;

   /* Stacks of the brw_instructions that opened each in-flight IF and
    * DO, plus the number of open IFs inside each loop level (needed by
    * BREAK/CONT jump patching on pre-gen6).
    */
   int if_stack_array_size = 16;
   int loop_stack_array_size = 16;
   int if_stack_depth = 0, loop_stack_depth = 0;
   brw_instruction **if_stack =
      rzalloc_array(this->mem_ctx, brw_instruction *, if_stack_array_size);
   brw_instruction **loop_stack =
      rzalloc_array(this->mem_ctx, brw_instruction *, loop_stack_array_size);
   int *if_depth_in_loop =
      rzalloc_array(this->mem_ctx, int, loop_stack_array_size);


   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
      printf("Native code for fragment shader %d:\n",
	     ctx->Shader.CurrentFragmentProgram->Name);
   }

   foreach_iter(exec_list_iterator, iter, this->instructions) {
      fs_inst *inst = (fs_inst *)iter.get();
      struct brw_reg src[3], dst;

      /* Under INTEL_DEBUG=wm, print the source IR / annotation for the
       * group of native instructions about to be emitted, but only when
       * it changes from the previous instruction's.
       */
      if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
	 if (last_annotation_ir != inst->ir) {
	    last_annotation_ir = inst->ir;
	    if (last_annotation_ir) {
	       printf("   ");
	       last_annotation_ir->print();
	       printf("\n");
	    }
	 }
	 if (last_annotation_string != inst->annotation) {
	    last_annotation_string = inst->annotation;
	    if (last_annotation_string)
	       printf("   %s\n", last_annotation_string);
	 }
      }

      /* Translate the IR registers to hardware registers. */
      for (unsigned int i = 0; i < 3; i++) {
	 src[i] = brw_reg_from_fs_reg(&inst->src[i]);
      }
      dst = brw_reg_from_fs_reg(&inst->dst);

      /* Instruction-level state that applies to whatever gets emitted
       * in the switch below.
       */
      brw_set_conditionalmod(p, inst->conditional_mod);
      brw_set_predicate_control(p, inst->predicated);
      brw_set_saturate(p, inst->saturate);

      switch (inst->opcode) {
      case BRW_OPCODE_MOV:
	 brw_MOV(p, dst, src[0]);
	 break;
      case BRW_OPCODE_ADD:
	 brw_ADD(p, dst, src[0], src[1]);
	 break;
      case BRW_OPCODE_MUL:
	 brw_MUL(p, dst, src[0], src[1]);
	 break;

      case BRW_OPCODE_FRC:
	 brw_FRC(p, dst, src[0]);
	 break;
      case BRW_OPCODE_RNDD:
	 brw_RNDD(p, dst, src[0]);
	 break;
      case BRW_OPCODE_RNDE:
	 brw_RNDE(p, dst, src[0]);
	 break;
      case BRW_OPCODE_RNDZ:
	 brw_RNDZ(p, dst, src[0]);
	 break;

      case BRW_OPCODE_AND:
	 brw_AND(p, dst, src[0], src[1]);
	 break;
      case BRW_OPCODE_OR:
	 brw_OR(p, dst, src[0], src[1]);
	 break;
      case BRW_OPCODE_XOR:
	 brw_XOR(p, dst, src[0], src[1]);
	 break;
      case BRW_OPCODE_NOT:
	 brw_NOT(p, dst, src[0]);
	 break;
      case BRW_OPCODE_ASR:
	 brw_ASR(p, dst, src[0], src[1]);
	 break;
      case BRW_OPCODE_SHR:
	 brw_SHR(p, dst, src[0], src[1]);
	 break;
      case BRW_OPCODE_SHL:
	 brw_SHL(p, dst, src[0], src[1]);
	 break;

      case BRW_OPCODE_CMP:
	 brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
	 break;
      case BRW_OPCODE_SEL:
	 brw_SEL(p, dst, src[0], src[1]);
	 break;

      case BRW_OPCODE_IF:
	 /* A populated src[0] means a gen6-style IF with an embedded
	  * compare; otherwise emit the predicate-based form.
	  */
	 if (inst->src[0].file != BAD_FILE) {
	    assert(intel->gen >= 6);
	    if_stack[if_stack_depth] = gen6_IF(p, inst->conditional_mod, src[0], src[1]);
	 } else {
	    if_stack[if_stack_depth] = brw_IF(p, BRW_EXECUTE_8);
	 }
	 if_depth_in_loop[loop_stack_depth]++;
	 if_stack_depth++;
	 /* Grow the IF stack when nesting exceeds its current size. */
	 if (if_stack_array_size <= if_stack_depth) {
	    if_stack_array_size *= 2;
	    if_stack = reralloc(this->mem_ctx, if_stack, brw_instruction *,
			        if_stack_array_size);
	 }
	 break;

      case BRW_OPCODE_ELSE:
	 /* brw_ELSE patches the matching IF and returns the new
	  * instruction to be patched by ENDIF.
	  */
	 if_stack[if_stack_depth - 1] =
	    brw_ELSE(p, if_stack[if_stack_depth - 1]);
	 break;
      case BRW_OPCODE_ENDIF:
	 if_stack_depth--;
	 brw_ENDIF(p , if_stack[if_stack_depth]);
	 if_depth_in_loop[loop_stack_depth]--;
	 break;

      case BRW_OPCODE_DO:
	 loop_stack[loop_stack_depth++] = brw_DO(p, BRW_EXECUTE_8);
	 /* Grow both loop-indexed arrays together when nesting deepens. */
	 if (loop_stack_array_size <= loop_stack_depth) {
	    loop_stack_array_size *= 2;
	    loop_stack = reralloc(this->mem_ctx, loop_stack, brw_instruction *,
				  loop_stack_array_size);
	    if_depth_in_loop = reralloc(this->mem_ctx, if_depth_in_loop, int,
				        loop_stack_array_size);
	 }
	 if_depth_in_loop[loop_stack_depth] = 0;
	 break;

      case BRW_OPCODE_BREAK:
	 brw_BREAK(p, if_depth_in_loop[loop_stack_depth]);
	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
	 break;
      case BRW_OPCODE_CONTINUE:
	 /* FINISHME: We need to write the loop instruction support still. */
	 if (intel->gen >= 6)
	    gen6_CONT(p, loop_stack[loop_stack_depth - 1]);
	 else
	    brw_CONT(p, if_depth_in_loop[loop_stack_depth]);
	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
	 break;

      case BRW_OPCODE_WHILE: {
	 struct brw_instruction *inst0, *inst1;
	 GLuint br = 1;

	 /* Jump counts are in units of 64-bit instruction halves on
	  * gen5+, whole instructions before that.
	  */
	 if (intel->gen >= 5)
	    br = 2;

	 assert(loop_stack_depth > 0);
	 loop_stack_depth--;
	 inst0 = inst1 = brw_WHILE(p, loop_stack[loop_stack_depth]);
	 if (intel->gen < 6) {
	    /* patch all the BREAK/CONT instructions from last BGNLOOP */
	    while (inst0 > loop_stack[loop_stack_depth]) {
	       inst0--;
	       /* A zero jump_count marks a BREAK/CONT that has not been
		* patched yet (i.e. one belonging to this loop).
		*/
	       if (inst0->header.opcode == BRW_OPCODE_BREAK &&
		   inst0->bits3.if_else.jump_count == 0) {
		  inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
	       }
	       else if (inst0->header.opcode == BRW_OPCODE_CONTINUE &&
			inst0->bits3.if_else.jump_count == 0) {
		  inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
	       }
	    }
	 }
      }
	 break;

      case FS_OPCODE_RCP:
      case FS_OPCODE_RSQ:
      case FS_OPCODE_SQRT:
      case FS_OPCODE_EXP2:
      case FS_OPCODE_LOG2:
      case FS_OPCODE_POW:
      case FS_OPCODE_SIN:
      case FS_OPCODE_COS:
	 generate_math(inst, dst, src);
	 break;
      case FS_OPCODE_CINTERP:
	 /* Constant interpolation is just a MOV of the setup value. */
	 brw_MOV(p, dst, src[0]);
	 break;
      case FS_OPCODE_LINTERP:
	 generate_linterp(inst, dst, src);
	 break;
      case FS_OPCODE_TEX:
      case FS_OPCODE_TXB:
      case FS_OPCODE_TXD:
      case FS_OPCODE_TXL:
	 generate_tex(inst, dst, src[0]);
	 break;
      case FS_OPCODE_DISCARD_NOT:
	 generate_discard_not(inst, dst);
	 break;
      case FS_OPCODE_DISCARD_AND:
	 generate_discard_and(inst, src[0]);
	 break;
      case FS_OPCODE_DDX:
	 generate_ddx(inst, dst, src[0]);
	 break;
      case FS_OPCODE_DDY:
	 generate_ddy(inst, dst, src[0]);
	 break;

      case FS_OPCODE_SPILL:
	 generate_spill(inst, src[0]);
	 break;

      case FS_OPCODE_UNSPILL:
	 generate_unspill(inst, dst);
	 break;

      case FS_OPCODE_PULL_CONSTANT_LOAD:
	 generate_pull_constant_load(inst, dst);
	 break;

      case FS_OPCODE_FB_WRITE:
	 generate_fb_write(inst);
	 break;
      default:
	 if (inst->opcode < (int)ARRAY_SIZE(brw_opcodes)) {
	    _mesa_problem(ctx, "Unsupported opcode `%s' in FS",
			  brw_opcodes[inst->opcode].name);
	 } else {
	    _mesa_problem(ctx, "Unsupported opcode %d in FS", inst->opcode);
	 }
	 fail("unsupported opcode in FS\n");
      }

      /* Disassemble the native instructions just emitted for this IR
       * instruction (everything since last_native_inst).
       */
      if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
	 for (unsigned int i = last_native_inst; i < p->nr_insn; i++) {
	    if (0) {
	       printf("0x%08x 0x%08x 0x%08x 0x%08x ",
		      ((uint32_t *)&p->store[i])[3],
		      ((uint32_t *)&p->store[i])[2],
		      ((uint32_t *)&p->store[i])[1],
		      ((uint32_t *)&p->store[i])[0]);
	    }
	    brw_disasm(stdout, &p->store[i], intel->gen);
	 }
      }

      last_native_inst = p->nr_insn;
   }

   ralloc_free(if_stack);
   ralloc_free(loop_stack);
   ralloc_free(if_depth_in_loop);

   /* Fix up UIP/JIP offsets now that all instructions are in place. */
   brw_set_uip_jip(p);

   /* OK, while the INTEL_DEBUG=wm above is very nice for debugging FS
    * emit issues, it doesn't get the jump distances into the output,
    * which is often something we want to debug.  So this is here in
    * case you're doing that.
    */
   if (0) {
      if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
	 for (unsigned int i = 0; i < p->nr_insn; i++) {
	    printf("0x%08x 0x%08x 0x%08x 0x%08x ",
		   ((uint32_t *)&p->store[i])[3],
		   ((uint32_t *)&p->store[i])[2],
		   ((uint32_t *)&p->store[i])[1],
		   ((uint32_t *)&p->store[i])[0]);
	    brw_disasm(stdout, &p->store[i], intel->gen);
	 }
      }
   }
}
3694
/* Top-level entry point for compiling a GLSL fragment shader with the
 * FS backend: builds the FS IR from the linked shader, runs the
 * optimization passes to a fixed point, allocates registers, and
 * generates native code.  Returns GL_FALSE if there is no GLSL shader
 * bound (so the caller can fall back) or if compilation failed.
 */
GLboolean
brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c)
{
   struct intel_context *intel = &brw->intel;
   struct gl_context *ctx = &intel->ctx;
   struct gl_shader_program *prog = ctx->Shader.CurrentFragmentProgram;

   if (!prog)
      return GL_FALSE;

   struct brw_shader *shader =
     (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
   if (!shader)
      return GL_FALSE;

   /* We always use 8-wide mode, at least for now.  For one, flow
    * control only works in 8-wide.  Also, when we're fragment shader
    * bound, we're almost always under register pressure as well, so
    * 8-wide would save us from the performance cliff of spilling
    * regs.
    */
   c->dispatch_width = 8;

   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
      printf("GLSL IR for native fragment shader %d:\n", prog->Name);
      _mesa_print_ir(shader->ir, NULL);
      printf("\n");
   }

   /* Now the main event: Visit the shader IR and generate our FS IR for it.
    */
   fs_visitor v(c, shader);

   if (0) {
      /* Debug path: emit a trivial shader instead of the real one. */
      v.emit_dummy_fs();
   } else {
      v.calculate_urb_setup();
      if (intel->gen < 6)
	 v.emit_interpolation_setup_gen4();
      else
	 v.emit_interpolation_setup_gen6();

      /* Generate FS IR for main().  (the visitor only descends into
       * functions called "main").
       */
      foreach_iter(exec_list_iterator, iter, *shader->ir) {
	 ir_instruction *ir = (ir_instruction *)iter.get();
	 v.base_ir = ir;
	 ir->accept(&v);
      }

      v.emit_fb_writes();

      v.split_virtual_grfs();

      v.setup_paramvalues_refs();
      v.setup_pull_constants();

      /* Iterate the optimization passes until none of them makes
       * further progress.
       */
      bool progress;
      do {
	 progress = false;

	 progress = v.remove_duplicate_mrf_writes() || progress;

	 progress = v.propagate_constants() || progress;
	 progress = v.register_coalesce() || progress;
	 progress = v.compute_to_mrf() || progress;
	 progress = v.dead_code_eliminate() || progress;
      } while (progress);

      v.schedule_instructions();

      v.assign_curb_setup();
      v.assign_urb_setup();

      if (0) {
	 /* Debug of register spilling: Go spill everything. */
	 int virtual_grf_count = v.virtual_grf_next;
	 for (int i = 1; i < virtual_grf_count; i++) {
	    v.spill_reg(i);
	 }
      }

      if (0)
	 v.assign_regs_trivial();
      else {
	 /* assign_regs() spills a register and retries on failure;
	  * loop until allocation succeeds or the visitor gives up.
	  */
	 while (!v.assign_regs()) {
	    if (v.failed)
	       break;
	 }
      }
   }

   if (!v.failed)
      v.generate_code();

   assert(!v.failed); /* FINISHME: Cleanly fail, tested at link time, etc. */

   if (v.failed)
      return GL_FALSE;

   c->prog_data.total_grf = v.grf_used;

   return GL_TRUE;
}
3800