brw_fs.cpp revision b943b9b1a696cf51adfb2a18bcb9cf503fb2737f
/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Eric Anholt <eric@anholt.net>
 *
 */

extern "C" {

#include <sys/types.h>

#include "main/macros.h"
#include "main/shaderobj.h"
#include "main/uniforms.h"
#include "program/prog_parameter.h"
#include "program/prog_print.h"
#include "program/prog_optimize.h"
#include "program/register_allocate.h"
#include "program/sampler.h"
#include "program/hash_table.h"
#include "brw_context.h"
#include "brw_eu.h"
#include "brw_wm.h"
}
#include "brw_fs.h"
#include "../glsl/glsl_types.h"
#include "../glsl/ir_optimization.h"
#include "../glsl/ir_print_visitor.h"

#define MAX_INSTRUCTION (1 << 30)
static struct brw_reg brw_reg_from_fs_reg(class fs_reg *reg);

struct gl_shader *
brw_new_shader(struct gl_context *ctx, GLuint name, GLuint type)
{
   struct brw_shader *shader;

   shader = rzalloc(NULL, struct brw_shader);
   if (shader) {
      shader->base.Type = type;
      shader->base.Name = name;
      _mesa_init_shader(ctx, &shader->base);
   }

   return &shader->base;
}

struct gl_shader_program *
brw_new_shader_program(struct gl_context *ctx, GLuint name)
{
   struct brw_shader_program *prog;
   prog = rzalloc(NULL, struct brw_shader_program);
   if (prog) {
      prog->base.Name = name;
      _mesa_init_shader_program(ctx, &prog->base);
   }
   return &prog->base;
}

GLboolean
brw_link_shader(struct gl_context *ctx, struct gl_shader_program *prog)
{
   struct brw_context *brw = brw_context(ctx);
   struct intel_context *intel = &brw->intel;

   struct brw_shader *shader =
      (struct brw_shader *)prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
   if (shader != NULL) {
      void *mem_ctx = ralloc_context(NULL);
      bool progress;

      if (shader->ir)
	 ralloc_free(shader->ir);
      shader->ir = new(shader) exec_list;
      clone_ir_list(mem_ctx, shader->ir, shader->base.ir);

      do_mat_op_to_vec(shader->ir);
      lower_instructions(shader->ir,
			 MOD_TO_FRACT |
			 DIV_TO_MUL_RCP |
			 SUB_TO_ADD_NEG |
			 EXP_TO_EXP2 |
			 LOG_TO_LOG2);

      /* Pre-gen6 HW can only nest if-statements 16 deep.  Beyond this,
       * if-statements need to be flattened.
       */
      if (intel->gen < 6)
	 lower_if_to_cond_assign(shader->ir, 16);

      do_lower_texture_projection(shader->ir);
      do_vec_index_to_cond_assign(shader->ir);
      brw_do_cubemap_normalize(shader->ir);
      lower_noise(shader->ir);
      lower_quadop_vector(shader->ir, false);
      lower_variable_index_to_cond_assign(shader->ir,
					  GL_TRUE, /* input */
					  GL_TRUE, /* output */
					  GL_TRUE, /* temp */
					  GL_TRUE /* uniform */
					  );

      do {
	 progress = false;

	 brw_do_channel_expressions(shader->ir);
	 brw_do_vector_splitting(shader->ir);

	 progress = do_lower_jumps(shader->ir, true, true,
				   true, /* main return */
				   false, /* continue */
				   false /* loops */
				   ) || progress;

	 progress = do_common_optimization(shader->ir, true, 32) || progress;
      } while (progress);

      validate_ir_tree(shader->ir);

      reparent_ir(shader->ir, shader->ir);
      ralloc_free(mem_ctx);
   }

   if (!_mesa_ir_link_shader(ctx, prog))
      return GL_FALSE;

   return GL_TRUE;
}

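/* Returns the number of scalar components (and thus consecutive virtual
 * GRF slots) that a value of the given GLSL type occupies.
 */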
static int
type_size(const struct glsl_type *type)
{
   unsigned int size, i;

   switch (type->base_type) {
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_BOOL:
      return type->components();
   case GLSL_TYPE_ARRAY:
      return type_size(type->fields.array) * type->length;
   case GLSL_TYPE_STRUCT:
      size = 0;
      for (i = 0; i < type->length; i++) {
	 size += type_size(type->fields.structure[i].type);
      }
      return size;
   case GLSL_TYPE_SAMPLER:
      /* Samplers take up no register space, since they're baked in at
       * link time.
       */
      return 0;
   default:
      assert(!"not reached");
      return 0;
   }
}

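/* Marks this compile as failed.  Only the first failure is printed, and
 * only when INTEL_DEBUG=wm is set.
 */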
void
fs_visitor::fail(const char *format, ...)
{
   if (!failed) {
      failed = true;

      if (INTEL_DEBUG & DEBUG_WM) {
	 fprintf(stderr, "FS compile failed: ");

	 va_list va;
	 va_start(va, format);
	 vfprintf(stderr, format, va);
	 va_end(va);
      }
   }
}

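/* While the force_uncompressed (or force_sechalf) stack is non-empty,
 * instructions emitted through fs_visitor::emit() are marked to operate
 * on only the first (or second) half of a 16-wide dispatch.
 */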
void
fs_visitor::push_force_uncompressed()
{
   force_uncompressed_stack++;
}

void
fs_visitor::pop_force_uncompressed()
{
   force_uncompressed_stack--;
   assert(force_uncompressed_stack >= 0);
}

void
fs_visitor::push_force_sechalf()
{
   force_sechalf_stack++;
}

void
fs_visitor::pop_force_sechalf()
{
   force_sechalf_stack--;
   assert(force_sechalf_stack >= 0);
}

/**
 * Returns how many MRFs an FS opcode will write over.
 *
 * Note that this is not the 0 or 1 implied writes in an actual gen
 * instruction -- the FS opcodes often generate MOVs in addition.
 */
int
fs_visitor::implied_mrf_writes(fs_inst *inst)
{
   if (inst->mlen == 0)
      return 0;

   switch (inst->opcode) {
   case FS_OPCODE_RCP:
   case FS_OPCODE_RSQ:
   case FS_OPCODE_SQRT:
   case FS_OPCODE_EXP2:
   case FS_OPCODE_LOG2:
   case FS_OPCODE_SIN:
   case FS_OPCODE_COS:
      return 1 * c->dispatch_width / 8;
   case FS_OPCODE_POW:
      return 2 * c->dispatch_width / 8;
   case FS_OPCODE_TEX:
   case FS_OPCODE_TXB:
   case FS_OPCODE_TXD:
   case FS_OPCODE_TXL:
      return 1;
   case FS_OPCODE_FB_WRITE:
      return 2;
   case FS_OPCODE_PULL_CONSTANT_LOAD:
   case FS_OPCODE_UNSPILL:
      return 1;
   case FS_OPCODE_SPILL:
      return 2;
   default:
      assert(!"not reached");
      return inst->mlen;
   }
}

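/* Allocates a new virtual GRF of the given size (in scalar slots) and
 * returns its register number, growing the size array geometrically as
 * needed.
 */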
int
fs_visitor::virtual_grf_alloc(int size)
{
   if (virtual_grf_array_size <= virtual_grf_next) {
      if (virtual_grf_array_size == 0)
	 virtual_grf_array_size = 16;
      else
	 virtual_grf_array_size *= 2;
      virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
				   virtual_grf_array_size);

      /* This slot is always unused. */
      virtual_grf_sizes[0] = 0;
   }
   virtual_grf_sizes[virtual_grf_next] = size;
   return virtual_grf_next++;
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int hw_reg)
{
   init();
   this->file = file;
   this->hw_reg = hw_reg;
   this->type = BRW_REGISTER_TYPE_F;
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int hw_reg, uint32_t type)
{
   init();
   this->file = file;
   this->hw_reg = hw_reg;
   this->type = type;
}

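/* Maps a GLSL base type to the hardware register type used to hold it. */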
int
brw_type_for_base_type(const struct glsl_type *type)
{
   switch (type->base_type) {
   case GLSL_TYPE_FLOAT:
      return BRW_REGISTER_TYPE_F;
   case GLSL_TYPE_INT:
   case GLSL_TYPE_BOOL:
      return BRW_REGISTER_TYPE_D;
   case GLSL_TYPE_UINT:
      return BRW_REGISTER_TYPE_UD;
   case GLSL_TYPE_ARRAY:
   case GLSL_TYPE_STRUCT:
   case GLSL_TYPE_SAMPLER:
      /* These should be overridden with the type of the member when
       * dereferenced into.  BRW_REGISTER_TYPE_UD seems like a likely
       * way to trip up if we don't.
       */
      return BRW_REGISTER_TYPE_UD;
   default:
      assert(!"not reached");
      return BRW_REGISTER_TYPE_F;
   }
}

/** Automatic reg constructor. */
fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
{
   init();

   this->file = GRF;
   this->reg = v->virtual_grf_alloc(type_size(type));
   this->reg_offset = 0;
   this->type = brw_type_for_base_type(type);
}

fs_reg *
fs_visitor::variable_storage(ir_variable *var)
{
   return (fs_reg *)hash_table_find(this->variable_ht, var);
}

/* Our support for uniforms is piggy-backed on the struct
 * gl_fragment_program, because that's where the values actually
 * get stored, rather than in some global gl_shader_program uniform
 * store.
 */
int
fs_visitor::setup_uniform_values(int loc, const glsl_type *type)
{
   unsigned int offset = 0;

   if (type->is_matrix()) {
      const glsl_type *column = glsl_type::get_instance(GLSL_TYPE_FLOAT,
							type->vector_elements,
							1);

      for (unsigned int i = 0; i < type->matrix_columns; i++) {
	 offset += setup_uniform_values(loc + offset, column);
      }

      return offset;
   }

   switch (type->base_type) {
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_BOOL:
      for (unsigned int i = 0; i < type->vector_elements; i++) {
	 unsigned int param = c->prog_data.nr_params++;

	 assert(param < ARRAY_SIZE(c->prog_data.param));

	 switch (type->base_type) {
	 case GLSL_TYPE_FLOAT:
	    c->prog_data.param_convert[param] = PARAM_NO_CONVERT;
	    break;
	 case GLSL_TYPE_UINT:
	    c->prog_data.param_convert[param] = PARAM_CONVERT_F2U;
	    break;
	 case GLSL_TYPE_INT:
	    c->prog_data.param_convert[param] = PARAM_CONVERT_F2I;
	    break;
	 case GLSL_TYPE_BOOL:
	    c->prog_data.param_convert[param] = PARAM_CONVERT_F2B;
	    break;
	 default:
	    assert(!"not reached");
	    c->prog_data.param_convert[param] = PARAM_NO_CONVERT;
	    break;
	 }
	 this->param_index[param] = loc;
	 this->param_offset[param] = i;
      }
      return 1;

   case GLSL_TYPE_STRUCT:
      for (unsigned int i = 0; i < type->length; i++) {
	 offset += setup_uniform_values(loc + offset,
					type->fields.structure[i].type);
      }
      return offset;

   case GLSL_TYPE_ARRAY:
      for (unsigned int i = 0; i < type->length; i++) {
	 offset += setup_uniform_values(loc + offset, type->fields.array);
      }
      return offset;

   case GLSL_TYPE_SAMPLER:
      /* The sampler takes up a slot, but we don't use any values from it. */
      return 1;

   default:
      assert(!"not reached");
      return 0;
   }
}


/* Our support for builtin uniforms is even scarier than non-builtin.
 * It sits on top of the PROG_STATE_VAR parameters that are
 * automatically updated from GL context state.
 */
void
fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
{
   const ir_state_slot *const slots = ir->state_slots;
   assert(ir->state_slots != NULL);

   for (unsigned int i = 0; i < ir->num_state_slots; i++) {
      /* This state reference has already been set up by ir_to_mesa, but we'll
       * get the same index back here.
       */
      int index = _mesa_add_state_reference(this->fp->Base.Parameters,
					    (gl_state_index *)slots[i].tokens);

      /* Add each of the unique swizzles of the element as a parameter.
       * This'll end up matching the expected layout of the
       * array/matrix/structure we're trying to fill in.
       */
      int last_swiz = -1;
      for (unsigned int j = 0; j < 4; j++) {
	 int swiz = GET_SWZ(slots[i].swizzle, j);
	 if (swiz == last_swiz)
	    break;
	 last_swiz = swiz;

	 c->prog_data.param_convert[c->prog_data.nr_params] =
	    PARAM_NO_CONVERT;
	 this->param_index[c->prog_data.nr_params] = index;
	 this->param_offset[c->prog_data.nr_params] = swiz;
	 c->prog_data.nr_params++;
      }
   }
}

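/* Sets up gl_FragCoord, honoring the fragment coordinate convention
 * qualifiers (pixel-center integer, origin) and flipping Y as needed
 * when rendering to an FBO.
 */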
fs_reg *
fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   fs_reg wpos = *reg;
   fs_reg neg_y = this->pixel_y;
   neg_y.negate = true;
   bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;

   /* gl_FragCoord.x */
   if (ir->pixel_center_integer) {
      emit(BRW_OPCODE_MOV, wpos, this->pixel_x);
   } else {
      emit(BRW_OPCODE_ADD, wpos, this->pixel_x, fs_reg(0.5f));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.y */
   if (!flip && ir->pixel_center_integer) {
      emit(BRW_OPCODE_MOV, wpos, this->pixel_y);
   } else {
      fs_reg pixel_y = this->pixel_y;
      float offset = (ir->pixel_center_integer ? 0.0 : 0.5);

      if (flip) {
	 pixel_y.negate = true;
	 offset += c->key.drawable_height - 1.0;
      }

      emit(BRW_OPCODE_ADD, wpos, pixel_y, fs_reg(offset));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.z */
   if (intel->gen >= 6) {
      emit(BRW_OPCODE_MOV, wpos,
	   fs_reg(brw_vec8_grf(c->source_depth_reg, 0)));
   } else {
      emit(FS_OPCODE_LINTERP, wpos, this->delta_x, this->delta_y,
	   interp_reg(FRAG_ATTRIB_WPOS, 2));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.w: Already set up in emit_interpolation */
   emit(BRW_OPCODE_MOV, wpos, this->wpos_w);

   return reg;
}

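/* Emits interpolation for a general varying input, walking array
 * elements and matrix columns.  gl_Color gets constant (flat)
 * interpolation when flat shading; everything else is linearly
 * interpolated, with a pixel_w multiply on pre-gen6 for perspective
 * correction.
 */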
fs_reg *
fs_visitor::emit_general_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   /* Interpolation is always in floating point regs. */
   reg->type = BRW_REGISTER_TYPE_F;
   fs_reg attr = *reg;

   unsigned int array_elements;
   const glsl_type *type;

   if (ir->type->is_array()) {
      array_elements = ir->type->length;
      if (array_elements == 0) {
	 fail("dereferenced array '%s' has length 0\n", ir->name);
      }
      type = ir->type->fields.array;
   } else {
      array_elements = 1;
      type = ir->type;
   }

   int location = ir->location;
   for (unsigned int i = 0; i < array_elements; i++) {
      for (unsigned int j = 0; j < type->matrix_columns; j++) {
	 if (urb_setup[location] == -1) {
	    /* If there's no incoming setup data for this slot, don't
	     * emit interpolation for it.
	     */
	    attr.reg_offset += type->vector_elements;
	    location++;
	    continue;
	 }

	 bool is_gl_Color =
	    location == FRAG_ATTRIB_COL0 || location == FRAG_ATTRIB_COL1;

	 if (c->key.flat_shade && is_gl_Color) {
	    /* Constant interpolation (flat shading) case. The SF has
	     * handed us defined values in only the constant offset
	     * field of the setup reg.
	     */
	    for (unsigned int k = 0; k < type->vector_elements; k++) {
	       struct brw_reg interp = interp_reg(location, k);
	       interp = suboffset(interp, 3);
	       emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
	       attr.reg_offset++;
	    }
	 } else {
	    /* Perspective interpolation case. */
	    for (unsigned int k = 0; k < type->vector_elements; k++) {
	       struct brw_reg interp = interp_reg(location, k);
	       emit(FS_OPCODE_LINTERP, attr,
		    this->delta_x, this->delta_y, fs_reg(interp));
	       attr.reg_offset++;
	    }

	    if (intel->gen < 6 && !(is_gl_Color && c->key.linear_color)) {
	       attr.reg_offset -= type->vector_elements;
	       for (unsigned int k = 0; k < type->vector_elements; k++) {
		  emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
		  attr.reg_offset++;
	       }
	    }
	 }
	 location++;
      }
   }

   return reg;
}

fs_reg *
fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);

   /* The frontfacing comes in as a bit in the thread payload. */
   if (intel->gen >= 6) {
      emit(BRW_OPCODE_ASR, *reg,
	   fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
	   fs_reg(15));
      emit(BRW_OPCODE_NOT, *reg, *reg);
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
   } else {
      struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
      /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
       * us front face
       */
      fs_inst *inst = emit(BRW_OPCODE_CMP, *reg,
			   fs_reg(r1_6ud),
			   fs_reg(1u << 31));
      inst->conditional_mod = BRW_CONDITIONAL_L;
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
   }

   return reg;
}

fs_inst *
fs_visitor::emit_math(fs_opcodes opcode, fs_reg dst, fs_reg src)
{
   switch (opcode) {
   case FS_OPCODE_RCP:
   case FS_OPCODE_RSQ:
   case FS_OPCODE_SQRT:
   case FS_OPCODE_EXP2:
   case FS_OPCODE_LOG2:
   case FS_OPCODE_SIN:
   case FS_OPCODE_COS:
      break;
   default:
      assert(!"not reached: bad math opcode");
      return NULL;
   }

   /* Can't do hstride == 0 args to gen6 math, so expand it out.  We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * The hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (intel->gen >= 6 && (src.file == UNIFORM ||
			   src.abs ||
			   src.negate)) {
      fs_reg expanded = fs_reg(this, glsl_type::float_type);
      emit(BRW_OPCODE_MOV, expanded, src);
      src = expanded;
   }

   fs_inst *inst = emit(opcode, dst, src);

   if (intel->gen < 6) {
      inst->base_mrf = 2;
      inst->mlen = c->dispatch_width / 8;
   }

   return inst;
}

fs_inst *
fs_visitor::emit_math(fs_opcodes opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   int base_mrf = 2;
   fs_inst *inst;

   assert(opcode == FS_OPCODE_POW);

   if (intel->gen >= 6) {
      /* Can't do hstride == 0 args to gen6 math, so expand it out.
       *
       * The hardware ignores source modifiers (negate and abs) on math
       * instructions, so we also move to a temp to set those up.
       */
      if (src0.file == UNIFORM || src0.abs || src0.negate) {
	 fs_reg expanded = fs_reg(this, glsl_type::float_type);
	 emit(BRW_OPCODE_MOV, expanded, src0);
	 src0 = expanded;
      }

      if (src1.file == UNIFORM || src1.abs || src1.negate) {
	 fs_reg expanded = fs_reg(this, glsl_type::float_type);
	 emit(BRW_OPCODE_MOV, expanded, src1);
	 src1 = expanded;
      }

      inst = emit(opcode, dst, src0, src1);
   } else {
      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1), src1);
      inst = emit(opcode, dst, src0, reg_null_f);

      inst->base_mrf = base_mrf;
      inst->mlen = 2 * c->dispatch_width / 8;
   }
   return inst;
}

void
fs_visitor::visit(ir_variable *ir)
{
   fs_reg *reg = NULL;

   if (variable_storage(ir))
      return;

   if (strcmp(ir->name, "gl_FragColor") == 0) {
      this->frag_color = ir;
   } else if (strcmp(ir->name, "gl_FragData") == 0) {
      this->frag_data = ir;
   } else if (strcmp(ir->name, "gl_FragDepth") == 0) {
      this->frag_depth = ir;
   }

   if (ir->mode == ir_var_in) {
      if (!strcmp(ir->name, "gl_FragCoord")) {
	 reg = emit_fragcoord_interpolation(ir);
      } else if (!strcmp(ir->name, "gl_FrontFacing")) {
	 reg = emit_frontfacing_interpolation(ir);
      } else {
	 reg = emit_general_interpolation(ir);
      }
      assert(reg);
      hash_table_insert(this->variable_ht, reg, ir);
      return;
   }

   if (ir->mode == ir_var_uniform) {
      int param_index = c->prog_data.nr_params;

      if (!strncmp(ir->name, "gl_", 3)) {
	 setup_builtin_uniform_values(ir);
      } else {
	 setup_uniform_values(ir->location, ir->type);
      }

      reg = new(this->mem_ctx) fs_reg(UNIFORM, param_index);
      reg->type = brw_type_for_base_type(ir->type);
   }

   if (!reg)
      reg = new(this->mem_ctx) fs_reg(this, ir->type);

   hash_table_insert(this->variable_ht, reg, ir);
}

void
fs_visitor::visit(ir_dereference_variable *ir)
{
   fs_reg *reg = variable_storage(ir->var);
   this->result = *reg;
}

void
fs_visitor::visit(ir_dereference_record *ir)
{
   const glsl_type *struct_type = ir->record->type;

   ir->record->accept(this);

   unsigned int offset = 0;
   for (unsigned int i = 0; i < struct_type->length; i++) {
      if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
	 break;
      offset += type_size(struct_type->fields.structure[i].type);
   }
   this->result.reg_offset += offset;
   this->result.type = brw_type_for_base_type(ir->type);
}

void
fs_visitor::visit(ir_dereference_array *ir)
{
   ir_constant *index;
   int element_size;

   ir->array->accept(this);
   index = ir->array_index->as_constant();

   element_size = type_size(ir->type);
   this->result.type = brw_type_for_base_type(ir->type);

   if (index) {
      assert(this->result.file == UNIFORM ||
	     (this->result.file == GRF &&
	      this->result.reg != 0));
      this->result.reg_offset += index->value.i[0] * element_size;
   } else {
      assert(!"FINISHME: non-constant array element");
   }
}

/* Instruction selection: Produce a MOV.sat instead of
 * MIN(MAX(val, 0), 1) when possible.
 */
bool
fs_visitor::try_emit_saturate(ir_expression *ir)
{
   ir_rvalue *sat_val = ir->as_rvalue_to_saturate();

   if (!sat_val)
      return false;

   sat_val->accept(this);
   fs_reg src = this->result;

   this->result = fs_reg(this, ir->type);
   fs_inst *inst = emit(BRW_OPCODE_MOV, this->result, src);
   inst->saturate = true;

   return true;
}

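/* Maps an IR comparison operation onto the conditional-modifier encoding
 * used by a CMP instruction.
 */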
static uint32_t
brw_conditional_for_comparison(unsigned int op)
{
   switch (op) {
   case ir_binop_less:
      return BRW_CONDITIONAL_L;
   case ir_binop_greater:
      return BRW_CONDITIONAL_G;
   case ir_binop_lequal:
      return BRW_CONDITIONAL_LE;
   case ir_binop_gequal:
      return BRW_CONDITIONAL_GE;
   case ir_binop_equal:
   case ir_binop_all_equal: /* same as equal for scalars */
      return BRW_CONDITIONAL_Z;
   case ir_binop_nequal:
   case ir_binop_any_nequal: /* same as nequal for scalars */
      return BRW_CONDITIONAL_NZ;
   default:
      assert(!"not reached: bad operation for comparison");
      return BRW_CONDITIONAL_NZ;
   }
}

void
fs_visitor::visit(ir_expression *ir)
{
   unsigned int operand;
   fs_reg op[2], temp;
   fs_inst *inst;

   assert(ir->get_num_operands() <= 2);

   if (try_emit_saturate(ir))
      return;

   for (operand = 0; operand < ir->get_num_operands(); operand++) {
      ir->operands[operand]->accept(this);
      if (this->result.file == BAD_FILE) {
	 ir_print_visitor v;
	 fail("Failed to get tree for expression operand:\n");
	 ir->operands[operand]->accept(&v);
      }
      op[operand] = this->result;

      /* Matrix expression operands should have been broken down to vector
       * operations already.
       */
      assert(!ir->operands[operand]->type->is_matrix());
      /* And then those vector operands should have been broken down to scalar.
       */
      assert(!ir->operands[operand]->type->is_vector());
   }

   /* Storage for our result.  If our result goes into an assignment, it will
    * just get copy-propagated out, so no worries.
    */
   this->result = fs_reg(this, ir->type);

   switch (ir->operation) {
   case ir_unop_logic_not:
      /* Note that BRW_OPCODE_NOT is not appropriate here, since it produces
       * the ones' complement of the whole register, not just bit 0.
       */
      emit(BRW_OPCODE_XOR, this->result, op[0], fs_reg(1));
      break;
   case ir_unop_neg:
      op[0].negate = !op[0].negate;
      this->result = op[0];
      break;
   case ir_unop_abs:
      op[0].abs = true;
      op[0].negate = false;
      this->result = op[0];
      break;
   case ir_unop_sign:
      temp = fs_reg(this, ir->type);

      emit(BRW_OPCODE_MOV, this->result, fs_reg(0.0f));

      inst = emit(BRW_OPCODE_CMP, reg_null_f, op[0], fs_reg(0.0f));
      inst->conditional_mod = BRW_CONDITIONAL_G;
      inst = emit(BRW_OPCODE_MOV, this->result, fs_reg(1.0f));
      inst->predicated = true;

      inst = emit(BRW_OPCODE_CMP, reg_null_f, op[0], fs_reg(0.0f));
      inst->conditional_mod = BRW_CONDITIONAL_L;
      inst = emit(BRW_OPCODE_MOV, this->result, fs_reg(-1.0f));
      inst->predicated = true;

      break;
   case ir_unop_rcp:
      emit_math(FS_OPCODE_RCP, this->result, op[0]);
      break;

   case ir_unop_exp2:
      emit_math(FS_OPCODE_EXP2, this->result, op[0]);
      break;
   case ir_unop_log2:
      emit_math(FS_OPCODE_LOG2, this->result, op[0]);
      break;
   case ir_unop_exp:
   case ir_unop_log:
      assert(!"not reached: should be handled by ir_explog_to_explog2");
      break;
   case ir_unop_sin:
   case ir_unop_sin_reduced:
      emit_math(FS_OPCODE_SIN, this->result, op[0]);
      break;
   case ir_unop_cos:
   case ir_unop_cos_reduced:
      emit_math(FS_OPCODE_COS, this->result, op[0]);
      break;

   case ir_unop_dFdx:
      emit(FS_OPCODE_DDX, this->result, op[0]);
      break;
   case ir_unop_dFdy:
      emit(FS_OPCODE_DDY, this->result, op[0]);
      break;

   case ir_binop_add:
      emit(BRW_OPCODE_ADD, this->result, op[0], op[1]);
      break;
   case ir_binop_sub:
      assert(!"not reached: should be handled by ir_sub_to_add_neg");
      break;

   case ir_binop_mul:
      emit(BRW_OPCODE_MUL, this->result, op[0], op[1]);
      break;
   case ir_binop_div:
      assert(!"not reached: should be handled by ir_div_to_mul_rcp");
      break;
   case ir_binop_mod:
      assert(!"ir_binop_mod should have been converted to b * fract(a/b)");
      break;

   case ir_binop_less:
   case ir_binop_greater:
   case ir_binop_lequal:
   case ir_binop_gequal:
   case ir_binop_equal:
   case ir_binop_all_equal:
   case ir_binop_nequal:
   case ir_binop_any_nequal:
      temp = this->result;
      /* original gen4 does implicit conversion before comparison. */
      if (intel->gen < 5)
	 temp.type = op[0].type;

      inst = emit(BRW_OPCODE_CMP, temp, op[0], op[1]);
      inst->conditional_mod = brw_conditional_for_comparison(ir->operation);
      emit(BRW_OPCODE_AND, this->result, this->result, fs_reg(0x1));
      break;

   case ir_binop_logic_xor:
      emit(BRW_OPCODE_XOR, this->result, op[0], op[1]);
      break;

   case ir_binop_logic_or:
      emit(BRW_OPCODE_OR, this->result, op[0], op[1]);
      break;

   case ir_binop_logic_and:
      emit(BRW_OPCODE_AND, this->result, op[0], op[1]);
      break;

   case ir_binop_dot:
   case ir_unop_any:
      assert(!"not reached: should be handled by brw_fs_channel_expressions");
      break;

   case ir_unop_noise:
      assert(!"not reached: should be handled by lower_noise");
      break;

   case ir_quadop_vector:
      assert(!"not reached: should be handled by lower_quadop_vector");
      break;

   case ir_unop_sqrt:
      emit_math(FS_OPCODE_SQRT, this->result, op[0]);
      break;

   case ir_unop_rsq:
      emit_math(FS_OPCODE_RSQ, this->result, op[0]);
      break;

   case ir_unop_i2f:
   case ir_unop_b2f:
   case ir_unop_b2i:
   case ir_unop_f2i:
      emit(BRW_OPCODE_MOV, this->result, op[0]);
      break;
   case ir_unop_f2b:
   case ir_unop_i2b:
      temp = this->result;
      /* original gen4 does implicit conversion before comparison. */
      if (intel->gen < 5)
	 temp.type = op[0].type;

      inst = emit(BRW_OPCODE_CMP, temp, op[0], fs_reg(0.0f));
      inst->conditional_mod = BRW_CONDITIONAL_NZ;
      inst = emit(BRW_OPCODE_AND, this->result, this->result, fs_reg(1));
      break;

   case ir_unop_trunc:
      emit(BRW_OPCODE_RNDZ, this->result, op[0]);
      break;
   case ir_unop_ceil:
      op[0].negate = !op[0].negate;
      inst = emit(BRW_OPCODE_RNDD, this->result, op[0]);
      this->result.negate = true;
      break;
   case ir_unop_floor:
      inst = emit(BRW_OPCODE_RNDD, this->result, op[0]);
      break;
   case ir_unop_fract:
      inst = emit(BRW_OPCODE_FRC, this->result, op[0]);
      break;
   case ir_unop_round_even:
      emit(BRW_OPCODE_RNDE, this->result, op[0]);
      break;

   case ir_binop_min:
      inst = emit(BRW_OPCODE_CMP, this->result, op[0], op[1]);
      inst->conditional_mod = BRW_CONDITIONAL_L;

      inst = emit(BRW_OPCODE_SEL, this->result, op[0], op[1]);
      inst->predicated = true;
      break;
   case ir_binop_max:
      inst = emit(BRW_OPCODE_CMP, this->result, op[0], op[1]);
      inst->conditional_mod = BRW_CONDITIONAL_G;

      inst = emit(BRW_OPCODE_SEL, this->result, op[0], op[1]);
      inst->predicated = true;
      break;

   case ir_binop_pow:
      emit_math(FS_OPCODE_POW, this->result, op[0], op[1]);
      break;

   case ir_unop_bit_not:
      inst = emit(BRW_OPCODE_NOT, this->result, op[0]);
      break;
   case ir_binop_bit_and:
      inst = emit(BRW_OPCODE_AND, this->result, op[0], op[1]);
      break;
   case ir_binop_bit_xor:
      inst = emit(BRW_OPCODE_XOR, this->result, op[0], op[1]);
      break;
   case ir_binop_bit_or:
      inst = emit(BRW_OPCODE_OR, this->result, op[0], op[1]);
      break;

   case ir_unop_u2f:
   case ir_binop_lshift:
   case ir_binop_rshift:
      assert(!"GLSL 1.30 features unsupported");
      break;
   }
}

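/* Recursively emits the per-component MOVs for an aggregate (array or
 * struct) assignment, advancing the l and r register offsets in lockstep.
 */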
void
fs_visitor::emit_assignment_writes(fs_reg &l, fs_reg &r,
				   const glsl_type *type, bool predicated)
{
   switch (type->base_type) {
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_BOOL:
      for (unsigned int i = 0; i < type->components(); i++) {
	 l.type = brw_type_for_base_type(type);
	 r.type = brw_type_for_base_type(type);

	 fs_inst *inst = emit(BRW_OPCODE_MOV, l, r);
	 inst->predicated = predicated;

	 l.reg_offset++;
	 r.reg_offset++;
      }
      break;
   case GLSL_TYPE_ARRAY:
      for (unsigned int i = 0; i < type->length; i++) {
	 emit_assignment_writes(l, r, type->fields.array, predicated);
      }
      break;

   case GLSL_TYPE_STRUCT:
      for (unsigned int i = 0; i < type->length; i++) {
	 emit_assignment_writes(l, r, type->fields.structure[i].type,
				predicated);
      }
      break;

   case GLSL_TYPE_SAMPLER:
      break;

   default:
      assert(!"not reached");
      break;
   }
}

void
fs_visitor::visit(ir_assignment *ir)
{
   struct fs_reg l, r;
   fs_inst *inst;

   /* FINISHME: arrays on the lhs */
   ir->lhs->accept(this);
   l = this->result;

   ir->rhs->accept(this);
   r = this->result;

   assert(l.file != BAD_FILE);
   assert(r.file != BAD_FILE);

   if (ir->condition) {
      emit_bool_to_cond_code(ir->condition);
   }

   if (ir->lhs->type->is_scalar() ||
       ir->lhs->type->is_vector()) {
      for (int i = 0; i < ir->lhs->type->vector_elements; i++) {
	 if (ir->write_mask & (1 << i)) {
	    inst = emit(BRW_OPCODE_MOV, l, r);
	    if (ir->condition)
	       inst->predicated = true;
	    r.reg_offset++;
	 }
	 l.reg_offset++;
      }
   } else {
      emit_assignment_writes(l, r, ir->lhs->type, ir->condition != NULL);
   }
}

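/* Builds the gen4 sampler message payload.  gen4's SIMD8 messages always
 * reserve the u/v/r coordinate slots, and its non-shadow bias/LOD lookups
 * only exist as SIMD16 messages, whose interleaved result we unshuffle
 * back into the expected SIMD8 layout below.
 */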
fs_inst *
fs_visitor::emit_texture_gen4(ir_texture *ir, fs_reg dst, fs_reg coordinate)
{
   int mlen;
   int base_mrf = 1;
   bool simd16 = false;
   fs_reg orig_dst;

   /* g0 header. */
   mlen = 1;

   if (ir->shadow_comparitor) {
      for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
	 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i), coordinate);
	 coordinate.reg_offset++;
      }
      /* gen4's SIMD8 sampler always has the slots for u,v,r present. */
      mlen += 3;

      if (ir->op == ir_tex) {
	 /* There's no plain shadow compare message, so we use shadow
	  * compare with a bias of 0.0.
	  */
	 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), fs_reg(0.0f));
	 mlen++;
      } else if (ir->op == ir_txb) {
	 ir->lod_info.bias->accept(this);
	 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
	 mlen++;
      } else {
	 assert(ir->op == ir_txl);
	 ir->lod_info.lod->accept(this);
	 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
	 mlen++;
      }

      ir->shadow_comparitor->accept(this);
      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
      mlen++;
   } else if (ir->op == ir_tex) {
      for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
	 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i), coordinate);
	 coordinate.reg_offset++;
      }
      /* gen4's SIMD8 sampler always has the slots for u,v,r present. */
      mlen += 3;
   } else if (ir->op == ir_txd) {
      assert(!"TXD isn't supported on gen4 yet.");
   } else {
      /* Oh joy.  gen4 doesn't have SIMD8 non-shadow-compare bias/lod
       * instructions.  We'll need to do SIMD16 here.
       */
      assert(ir->op == ir_txb || ir->op == ir_txl);

      for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
	 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i * 2), coordinate);
	 coordinate.reg_offset++;
      }

      /* lod/bias appears after u/v/r. */
      mlen += 6;

      if (ir->op == ir_txb) {
	 ir->lod_info.bias->accept(this);
	 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
	 mlen++;
      } else {
	 ir->lod_info.lod->accept(this);
	 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
	 mlen++;
      }

      /* The unused upper half. */
      mlen++;

      /* Now, since we're doing simd16, the return is 2 interleaved
       * vec4s where the odd-indexed ones are junk. We'll need to move
       * this weirdness around to the expected layout.
       */
      simd16 = true;
      orig_dst = dst;
      dst = fs_reg(this, glsl_type::get_array_instance(glsl_type::vec4_type,
						       2));
      dst.type = BRW_REGISTER_TYPE_F;
   }

   fs_inst *inst = NULL;
   switch (ir->op) {
   case ir_tex:
      inst = emit(FS_OPCODE_TEX, dst);
      break;
   case ir_txb:
      inst = emit(FS_OPCODE_TXB, dst);
      break;
   case ir_txl:
      inst = emit(FS_OPCODE_TXL, dst);
      break;
   case ir_txd:
      inst = emit(FS_OPCODE_TXD, dst);
      break;
   case ir_txf:
      assert(!"GLSL 1.30 features unsupported");
      break;
   }
   inst->base_mrf = base_mrf;
   inst->mlen = mlen;

   if (simd16) {
      for (int i = 0; i < 4; i++) {
	 emit(BRW_OPCODE_MOV, orig_dst, dst);
	 orig_dst.reg_offset++;
	 dst.reg_offset += 2;
      }
   }

   return inst;
}

/* gen5's sampler has slots for u, v, r, array index, then optional
 * parameters like shadow comparitor or LOD bias.  If optional
 * parameters aren't present, those base slots are optional and don't
 * need to be included in the message.
 *
 * We don't fill in the unnecessary slots regardless, which may look
 * surprising in the disassembly.
 */
fs_inst *
fs_visitor::emit_texture_gen5(ir_texture *ir, fs_reg dst, fs_reg coordinate)
{
   int mlen = 1; /* g0 header always present. */
   int base_mrf = 1;
   int reg_width = c->dispatch_width / 8;

   for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i * reg_width),
	   coordinate);
      coordinate.reg_offset++;
   }
   mlen += ir->coordinate->type->vector_elements * reg_width;

   if (ir->shadow_comparitor) {
      mlen = MAX2(mlen, 1 + 4 * reg_width);

      ir->shadow_comparitor->accept(this);
      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
      mlen += reg_width;
   }

   fs_inst *inst = NULL;
   switch (ir->op) {
   case ir_tex:
      inst = emit(FS_OPCODE_TEX, dst);
      break;
   case ir_txb:
      ir->lod_info.bias->accept(this);
      mlen = MAX2(mlen, 1 + 4 * reg_width);
      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
      mlen += reg_width;

      inst = emit(FS_OPCODE_TXB, dst);

      break;
   case ir_txl:
      ir->lod_info.lod->accept(this);
      mlen = MAX2(mlen, 1 + 4 * reg_width);
      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
      mlen += reg_width;

      inst = emit(FS_OPCODE_TXL, dst);
      break;
   case ir_txd:
   case ir_txf:
      assert(!"GLSL 1.30 features unsupported");
      break;
   }
   inst->base_mrf = base_mrf;
   inst->mlen = mlen;

   if (mlen > 11) {
      fail("Message length >11 disallowed by hardware\n");
   }

   return inst;
}

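/* Visits a texture IR node: emits the texel-offset message header if
 * present, scales rectangle texture coordinates on the EU, dispatches to
 * the per-generation message setup, and applies any needed result
 * swizzle.
 */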
void
fs_visitor::visit(ir_texture *ir)
{
   int sampler;
   fs_inst *inst = NULL;

   ir->coordinate->accept(this);
   fs_reg coordinate = this->result;

   if (ir->offset != NULL) {
      ir_constant *offset = ir->offset->as_constant();
      assert(offset != NULL);

      signed char offsets[3];
      for (unsigned i = 0; i < ir->offset->type->vector_elements; i++)
	 offsets[i] = (signed char) offset->value.i[i];

      /* Combine all three offsets into a single unsigned dword:
       *
       *    bits 11:8 - U Offset (X component)
       *    bits  7:4 - V Offset (Y component)
       *    bits  3:0 - R Offset (Z component)
       */
      unsigned offset_bits = 0;
      for (unsigned i = 0; i < ir->offset->type->vector_elements; i++) {
	 const unsigned shift = 4 * (2 - i);
	 offset_bits |= (offsets[i] << shift) & (0xF << shift);
      }

      /* Explicitly set up the message header by copying g0 to msg reg m1. */
      emit(BRW_OPCODE_MOV, fs_reg(MRF, 1, BRW_REGISTER_TYPE_UD),
	   fs_reg(GRF, 0, BRW_REGISTER_TYPE_UD));

      /* Then set the offset bits in DWord 2 of the message header. */
      emit(BRW_OPCODE_MOV,
	   fs_reg(retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, 1, 2),
			 BRW_REGISTER_TYPE_UD)),
	   fs_reg(brw_imm_uw(offset_bits)));
   }

   /* Should be lowered by do_lower_texture_projection */
   assert(!ir->projector);

   sampler = _mesa_get_sampler_uniform_value(ir->sampler,
					     ctx->Shader.CurrentFragmentProgram,
					     &brw->fragment_program->Base);
   sampler = c->fp->program.Base.SamplerUnits[sampler];

   /* The 965 requires the EU to do the normalization of GL rectangle
    * texture coordinates.  We use the program parameter state
    * tracking to get the scaling factor.
    */
   if (ir->sampler->type->sampler_dimensionality == GLSL_SAMPLER_DIM_RECT) {
      struct gl_program_parameter_list *params = c->fp->program.Base.Parameters;
      int tokens[STATE_LENGTH] = {
	 STATE_INTERNAL,
	 STATE_TEXRECT_SCALE,
	 sampler,
	 0,
	 0
      };

      c->prog_data.param_convert[c->prog_data.nr_params] =
	 PARAM_NO_CONVERT;
      c->prog_data.param_convert[c->prog_data.nr_params + 1] =
	 PARAM_NO_CONVERT;

      fs_reg scale_x = fs_reg(UNIFORM, c->prog_data.nr_params);
      fs_reg scale_y = fs_reg(UNIFORM, c->prog_data.nr_params + 1);
      GLuint index = _mesa_add_state_reference(params,
					       (gl_state_index *)tokens);

      this->param_index[c->prog_data.nr_params] = index;
      this->param_offset[c->prog_data.nr_params] = 0;
      c->prog_data.nr_params++;
      this->param_index[c->prog_data.nr_params] = index;
      this->param_offset[c->prog_data.nr_params] = 1;
      c->prog_data.nr_params++;

      fs_reg dst = fs_reg(this, ir->coordinate->type);
      fs_reg src = coordinate;
      coordinate = dst;

      emit(BRW_OPCODE_MUL, dst, src, scale_x);
      dst.reg_offset++;
      src.reg_offset++;
      emit(BRW_OPCODE_MUL, dst, src, scale_y);
   }

   /* Writemasking doesn't eliminate channels on SIMD8 texture
    * samples, so don't worry about them.
    */
   fs_reg dst = fs_reg(this, glsl_type::vec4_type);

   if (intel->gen < 5) {
      inst = emit_texture_gen4(ir, dst, coordinate);
   } else {
      inst = emit_texture_gen5(ir, dst, coordinate);
   }

   /* If there's an offset, we already set up m1.  To avoid the implied move,
    * use the null register.  Otherwise, we want an implied move from g0.
    */
   if (ir->offset != NULL)
      inst->src[0] = fs_reg(brw_null_reg());
   else
      inst->src[0] = fs_reg(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW));

   inst->sampler = sampler;

   this->result = dst;

   if (ir->shadow_comparitor)
      inst->shadow_compare = true;

   if (ir->type == glsl_type::float_type) {
      /* Ignore DEPTH_TEXTURE_MODE swizzling. */
      assert(ir->sampler->type->sampler_shadow);
   } else if (c->key.tex_swizzles[inst->sampler] != SWIZZLE_NOOP) {
      fs_reg swizzle_dst = fs_reg(this, glsl_type::vec4_type);

      for (int i = 0; i < 4; i++) {
	 int swiz = GET_SWZ(c->key.tex_swizzles[inst->sampler], i);
	 fs_reg l = swizzle_dst;
	 l.reg_offset += i;

	 if (swiz == SWIZZLE_ZERO) {
	    emit(BRW_OPCODE_MOV, l, fs_reg(0.0f));
	 } else if (swiz == SWIZZLE_ONE) {
	    emit(BRW_OPCODE_MOV, l, fs_reg(1.0f));
	 } else {
	    fs_reg r = dst;
	    r.reg_offset += GET_SWZ(c->key.tex_swizzles[inst->sampler], i);
	    emit(BRW_OPCODE_MOV, l, r);
	 }
      }
      this->result = swizzle_dst;
   }
}

void
fs_visitor::visit(ir_swizzle *ir)
{
   ir->val->accept(this);
   fs_reg val = this->result;

   if (ir->type->vector_elements == 1) {
      this->result.reg_offset += ir->mask.x;
      return;
   }

   fs_reg result = fs_reg(this, ir->type);
   this->result = result;

   for (unsigned int i = 0; i < ir->type->vector_elements; i++) {
      fs_reg channel = val;
      int swiz = 0;

      switch (i) {
      case 0:
	 swiz = ir->mask.x;
	 break;
      case 1:
	 swiz = ir->mask.y;
	 break;
      case 2:
	 swiz = ir->mask.z;
	 break;
      case 3:
	 swiz = ir->mask.w;
	 break;
      }

      channel.reg_offset += swiz;
      emit(BRW_OPCODE_MOV, result, channel);
      result.reg_offset++;
   }
}

void
fs_visitor::visit(ir_discard *ir)
{
   fs_reg temp = fs_reg(this, glsl_type::uint_type);

   assert(ir->condition == NULL); /* FINISHME */

   emit(FS_OPCODE_DISCARD_NOT, temp, reg_null_d);
   emit(FS_OPCODE_DISCARD_AND, reg_null_d, temp);
   kill_emitted = true;
}

void
fs_visitor::visit(ir_constant *ir)
{
   /* Set this->result to reg at the bottom of the function because some code
    * paths will cause this visitor to be applied to other fields.  This will
    * cause the value stored in this->result to be modified.
    *
    * Make reg constant so that it doesn't get accidentally modified along the
    * way.  Yes, I actually had this problem. :(
    */
   const fs_reg reg(this, ir->type);
   fs_reg dst_reg = reg;

   if (ir->type->is_array()) {
      const unsigned size = type_size(ir->type->fields.array);

      for (unsigned i = 0; i < ir->type->length; i++) {
	 ir->array_elements[i]->accept(this);
	 fs_reg src_reg = this->result;

	 dst_reg.type = src_reg.type;
	 for (unsigned j = 0; j < size; j++) {
	    emit(BRW_OPCODE_MOV, dst_reg, src_reg);
	    src_reg.reg_offset++;
	    dst_reg.reg_offset++;
	 }
      }
   } else if (ir->type->is_record()) {
      foreach_list(node, &ir->components) {
	 ir_instruction *const field = (ir_instruction *) node;
	 const unsigned size = type_size(field->type);

	 field->accept(this);
	 fs_reg src_reg = this->result;

	 dst_reg.type = src_reg.type;
	 for (unsigned j = 0; j < size; j++) {
	    emit(BRW_OPCODE_MOV, dst_reg, src_reg);
	    src_reg.reg_offset++;
	    dst_reg.reg_offset++;
	 }
      }
   } else {
      const unsigned size = type_size(ir->type);

      for (unsigned i = 0; i < size; i++) {
	 switch (ir->type->base_type) {
	 case GLSL_TYPE_FLOAT:
	    emit(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.f[i]));
	    break;
	 case GLSL_TYPE_UINT:
	    emit(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.u[i]));
	    break;
	 case GLSL_TYPE_INT:
	    emit(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.i[i]));
	    break;
	 case GLSL_TYPE_BOOL:
	    emit(BRW_OPCODE_MOV, dst_reg, fs_reg((int)ir->value.b[i]));
	    break;
	 default:
	    assert(!"Non-float/uint/int/bool constant");
	 }
	 dst_reg.reg_offset++;
      }
   }

   this->result = reg;
}

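/* Evaluates a boolean rvalue purely for its condition code, folding
 * comparisons and logic ops directly into conditional modifiers on
 * null-destination instructions where possible.
 */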
void
fs_visitor::emit_bool_to_cond_code(ir_rvalue *ir)
{
   ir_expression *expr = ir->as_expression();

   if (expr) {
      fs_reg op[2];
      fs_inst *inst;

      assert(expr->get_num_operands() <= 2);
      for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
	 assert(expr->operands[i]->type->is_scalar());

	 expr->operands[i]->accept(this);
	 op[i] = this->result;
      }

      switch (expr->operation) {
      case ir_unop_logic_not:
	 inst = emit(BRW_OPCODE_AND, reg_null_d, op[0], fs_reg(1));
	 inst->conditional_mod = BRW_CONDITIONAL_Z;
	 break;

      case ir_binop_logic_xor:
	 inst = emit(BRW_OPCODE_XOR, reg_null_d, op[0], op[1]);
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 break;

      case ir_binop_logic_or:
	 inst = emit(BRW_OPCODE_OR, reg_null_d, op[0], op[1]);
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 break;

      case ir_binop_logic_and:
	 inst = emit(BRW_OPCODE_AND, reg_null_d, op[0], op[1]);
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 break;

      case ir_unop_f2b:
	 if (intel->gen >= 6) {
	    inst = emit(BRW_OPCODE_CMP, reg_null_d, op[0], fs_reg(0.0f));
	 } else {
	    inst = emit(BRW_OPCODE_MOV, reg_null_f, op[0]);
	 }
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 break;

      case ir_unop_i2b:
	 if (intel->gen >= 6) {
	    inst = emit(BRW_OPCODE_CMP, reg_null_d, op[0], fs_reg(0));
	 } else {
	    inst = emit(BRW_OPCODE_MOV, reg_null_d, op[0]);
	 }
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 break;

      case ir_binop_greater:
      case ir_binop_gequal:
      case ir_binop_less:
      case ir_binop_lequal:
      case ir_binop_equal:
      case ir_binop_all_equal:
      case ir_binop_nequal:
      case ir_binop_any_nequal:
	 inst = emit(BRW_OPCODE_CMP, reg_null_cmp, op[0], op[1]);
	 inst->conditional_mod =
	    brw_conditional_for_comparison(expr->operation);
	 break;

      default:
	 assert(!"not reached");
	 fail("bad cond code\n");
	 break;
      }
      return;
   }

   ir->accept(this);

   if (intel->gen >= 6) {
      fs_inst *inst = emit(BRW_OPCODE_AND, reg_null_d, this->result, fs_reg(1));
      inst->conditional_mod = BRW_CONDITIONAL_NZ;
   } else {
      fs_inst *inst = emit(BRW_OPCODE_MOV, reg_null_d, this->result);
      inst->conditional_mod = BRW_CONDITIONAL_NZ;
   }
}

/**
 * Emit a gen6 IF statement with the comparison folded into the IF
 * instruction.
 */
void
fs_visitor::emit_if_gen6(ir_if *ir)
{
   ir_expression *expr = ir->condition->as_expression();

   if (expr) {
      fs_reg op[2];
      fs_inst *inst;
      fs_reg temp;

      assert(expr->get_num_operands() <= 2);
      for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
	 assert(expr->operands[i]->type->is_scalar());

	 expr->operands[i]->accept(this);
	 op[i] = this->result;
      }

      switch (expr->operation) {
      case ir_unop_logic_not:
	 inst = emit(BRW_OPCODE_IF, temp, op[0], fs_reg(0));
	 inst->conditional_mod = BRW_CONDITIONAL_Z;
	 return;

      case ir_binop_logic_xor:
	 inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], op[1]);
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 return;

      case ir_binop_logic_or:
	 temp = fs_reg(this, glsl_type::bool_type);
	 emit(BRW_OPCODE_OR, temp, op[0], op[1]);
	 inst = emit(BRW_OPCODE_IF, reg_null_d, temp, fs_reg(0));
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 return;

      case ir_binop_logic_and:
	 temp = fs_reg(this, glsl_type::bool_type);
	 emit(BRW_OPCODE_AND, temp, op[0], op[1]);
	 inst = emit(BRW_OPCODE_IF, reg_null_d, temp, fs_reg(0));
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 return;

      case ir_unop_f2b:
	 inst = emit(BRW_OPCODE_IF, reg_null_f, op[0], fs_reg(0));
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 return;

      case ir_unop_i2b:
	 inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], fs_reg(0));
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 return;

      case ir_binop_greater:
      case ir_binop_gequal:
      case ir_binop_less:
      case ir_binop_lequal:
      case ir_binop_equal:
      case ir_binop_all_equal:
      case ir_binop_nequal:
      case ir_binop_any_nequal:
	 inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], op[1]);
	 inst->conditional_mod =
	    brw_conditional_for_comparison(expr->operation);
	 return;
      default:
	 assert(!"not reached");
	 inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], fs_reg(0));
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 fail("bad condition\n");
	 return;
      }
      return;
   }

   ir->condition->accept(this);

   fs_inst *inst = emit(BRW_OPCODE_IF, reg_null_d, this->result, fs_reg(0));
   inst->conditional_mod = BRW_CONDITIONAL_NZ;
}

void
fs_visitor::visit(ir_if *ir)
{
   fs_inst *inst;

   if (c->dispatch_width == 16) {
      fail("Can't support (non-uniform) control flow on 16-wide\n");
   }

   /* Don't point the annotation at the if statement, because then it plus
    * the then and else blocks get printed.
    */
   this->base_ir = ir->condition;

   if (intel->gen >= 6) {
      emit_if_gen6(ir);
   } else {
      emit_bool_to_cond_code(ir->condition);

      inst = emit(BRW_OPCODE_IF);
      inst->predicated = true;
   }

   foreach_iter(exec_list_iterator, iter, ir->then_instructions) {
      ir_instruction *ir = (ir_instruction *)iter.get();
      this->base_ir = ir;

      ir->accept(this);
   }

   if (!ir->else_instructions.is_empty()) {
      emit(BRW_OPCODE_ELSE);

      foreach_iter(exec_list_iterator, iter, ir->else_instructions) {
	 ir_instruction *ir = (ir_instruction *)iter.get();
	 this->base_ir = ir;

	 ir->accept(this);
      }
   }

   emit(BRW_OPCODE_ENDIF);
}

void
fs_visitor::visit(ir_loop *ir)
{
   fs_reg counter = reg_undef;

   if (c->dispatch_width == 16) {
      fail("Can't support (non-uniform) control flow on 16-wide\n");
   }

   if (ir->counter) {
      this->base_ir = ir->counter;
      ir->counter->accept(this);
      counter = *(variable_storage(ir->counter));

      if (ir->from) {
	 this->base_ir = ir->from;
	 ir->from->accept(this);

	 emit(BRW_OPCODE_MOV, counter, this->result);
      }
   }

   emit(BRW_OPCODE_DO);

   if (ir->to) {
      this->base_ir = ir->to;
      ir->to->accept(this);

      fs_inst *inst = emit(BRW_OPCODE_CMP, reg_null_cmp, counter, this->result);
      inst->conditional_mod = brw_conditional_for_comparison(ir->cmp);

      inst = emit(BRW_OPCODE_BREAK);
      inst->predicated = true;
   }

   foreach_iter(exec_list_iterator, iter, ir->body_instructions) {
      ir_instruction *ir = (ir_instruction *)iter.get();

      this->base_ir = ir;
      ir->accept(this);
   }

   if (ir->increment) {
      this->base_ir = ir->increment;
      ir->increment->accept(this);
      emit(BRW_OPCODE_ADD, counter, counter, this->result);
   }

   emit(BRW_OPCODE_WHILE);
}

void
fs_visitor::visit(ir_loop_jump *ir)
{
   switch (ir->mode) {
   case ir_loop_jump::jump_break:
      emit(BRW_OPCODE_BREAK);
      break;
   case ir_loop_jump::jump_continue:
      emit(BRW_OPCODE_CONTINUE);
      break;
   }
}

void
fs_visitor::visit(ir_call *ir)
{
   assert(!"FINISHME");
}

void
fs_visitor::visit(ir_return *ir)
{
   assert(!"FINISHME");
}

void
fs_visitor::visit(ir_function *ir)
{
   /* Ignore function bodies other than main() -- we shouldn't see calls to
    * them since they should all be inlined before we get to ir_to_mesa.
    */
   if (strcmp(ir->name, "main") == 0) {
      const ir_function_signature *sig;
      exec_list empty;

      sig = ir->matching_signature(&empty);

      assert(sig);

      foreach_iter(exec_list_iterator, iter, sig->body) {
	 ir_instruction *ir = (ir_instruction *)iter.get();
	 this->base_ir = ir;

	 ir->accept(this);
      }
   }
}

void
fs_visitor::visit(ir_function_signature *ir)
{
   assert(!"not reached");
   (void)ir;
}

1912fs_inst *
1913fs_visitor::emit(fs_inst inst)
1914{
1915   fs_inst *list_inst = new(mem_ctx) fs_inst;
1916   *list_inst = inst;
1917
1918   if (force_uncompressed_stack > 0)
1919      list_inst->force_uncompressed = true;
1920   else if (force_sechalf_stack > 0)
1921      list_inst->force_sechalf = true;
1922
1923   list_inst->annotation = this->current_annotation;
1924   list_inst->ir = this->base_ir;
1925
1926   this->instructions.push_tail(list_inst);
1927
1928   return list_inst;
1929}
1930
1931/** Emits a dummy fragment shader consisting of magenta for bringup purposes. */
1932void
1933fs_visitor::emit_dummy_fs()
1934{
1935   /* Everyone's favorite color. */
1936   emit(BRW_OPCODE_MOV, fs_reg(MRF, 2), fs_reg(1.0f));
1937   emit(BRW_OPCODE_MOV, fs_reg(MRF, 3), fs_reg(0.0f));
1938   emit(BRW_OPCODE_MOV, fs_reg(MRF, 4), fs_reg(1.0f));
1939   emit(BRW_OPCODE_MOV, fs_reg(MRF, 5), fs_reg(0.0f));
1940
1941   fs_inst *write;
1942   write = emit(FS_OPCODE_FB_WRITE, fs_reg(0), fs_reg(0));
1943   write->base_mrf = 0;
1944}
1945
1946/* The register location here is relative to the start of the URB
1947 * data.  It will get adjusted to be a real location before
1948 * generate_code() time.
1949 */
1950struct brw_reg
1951fs_visitor::interp_reg(int location, int channel)
1952{
1953   int regnr = urb_setup[location] * 2 + channel / 2;
1954   int stride = (channel & 1) * 4;
1955
1956   assert(urb_setup[location] != -1);
1957
1958   return brw_vec1_grf(regnr, stride);
1959}
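/* Worked example (illustrative values): with urb_setup[location] == 1
 * and channel == 3, regnr == 1 * 2 + 3 / 2 == 3 and the sub-register
 * offset is (3 & 1) * 4 == 4 floats, i.e. the second half of g3.  Two
 * channels thus share each setup register.
 */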
1960
1961/** Emits the interpolation for the varying inputs. */
1962void
1963fs_visitor::emit_interpolation_setup_gen4()
1964{
1965   this->current_annotation = "compute pixel centers";
1966   this->pixel_x = fs_reg(this, glsl_type::uint_type);
1967   this->pixel_y = fs_reg(this, glsl_type::uint_type);
1968   this->pixel_x.type = BRW_REGISTER_TYPE_UW;
1969   this->pixel_y.type = BRW_REGISTER_TYPE_UW;
1970
1971   emit(FS_OPCODE_PIXEL_X, this->pixel_x);
1972   emit(FS_OPCODE_PIXEL_Y, this->pixel_y);
1973
1974   this->current_annotation = "compute pixel deltas from v0";
1975   if (brw->has_pln) {
1976      this->delta_x = fs_reg(this, glsl_type::vec2_type);
1977      this->delta_y = this->delta_x;
1978      this->delta_y.reg_offset++;
1979   } else {
1980      this->delta_x = fs_reg(this, glsl_type::float_type);
1981      this->delta_y = fs_reg(this, glsl_type::float_type);
1982   }
1983   emit(BRW_OPCODE_ADD, this->delta_x,
1984	this->pixel_x, fs_reg(negate(brw_vec1_grf(1, 0))));
1985   emit(BRW_OPCODE_ADD, this->delta_y,
1986	this->pixel_y, fs_reg(negate(brw_vec1_grf(1, 1))));
1987
1988   this->current_annotation = "compute pos.w and 1/pos.w";
1989   /* Compute wpos.w.  It's always in our setup, since it's needed to
1990    * interpolate the other attributes.
1991    */
1992   this->wpos_w = fs_reg(this, glsl_type::float_type);
1993   emit(FS_OPCODE_LINTERP, wpos_w, this->delta_x, this->delta_y,
1994	interp_reg(FRAG_ATTRIB_WPOS, 3));
1995   /* Compute the pixel 1/W value from wpos.w. */
1996   this->pixel_w = fs_reg(this, glsl_type::float_type);
1997   emit_math(FS_OPCODE_RCP, this->pixel_w, wpos_w);
1998   this->current_annotation = NULL;
1999}
2000
2001/** Emits the interpolation for the varying inputs. */
2002void
2003fs_visitor::emit_interpolation_setup_gen6()
2004{
2005   struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
2006
2007   /* If the pixel centers end up used, the setup is the same as for gen4. */
2008   this->current_annotation = "compute pixel centers";
2009   fs_reg int_pixel_x = fs_reg(this, glsl_type::uint_type);
2010   fs_reg int_pixel_y = fs_reg(this, glsl_type::uint_type);
2011   int_pixel_x.type = BRW_REGISTER_TYPE_UW;
2012   int_pixel_y.type = BRW_REGISTER_TYPE_UW;
2013   emit(BRW_OPCODE_ADD,
2014	int_pixel_x,
2015	fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
2016	fs_reg(brw_imm_v(0x10101010)));
2017   emit(BRW_OPCODE_ADD,
2018	int_pixel_y,
2019	fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
2020	fs_reg(brw_imm_v(0x11001100)));
2021
2022   /* As of gen6, we can no longer mix float and int sources.  We have
2023    * to turn the integer pixel centers into floats for their actual
2024    * use.
2025    */
2026   this->pixel_x = fs_reg(this, glsl_type::float_type);
2027   this->pixel_y = fs_reg(this, glsl_type::float_type);
2028   emit(BRW_OPCODE_MOV, this->pixel_x, int_pixel_x);
2029   emit(BRW_OPCODE_MOV, this->pixel_y, int_pixel_y);
2030
2031   this->current_annotation = "compute pos.w";
2032   this->pixel_w = fs_reg(brw_vec8_grf(c->source_w_reg, 0));
2033   this->wpos_w = fs_reg(this, glsl_type::float_type);
2034   emit_math(FS_OPCODE_RCP, this->wpos_w, this->pixel_w);
2035
2036   this->delta_x = fs_reg(brw_vec8_grf(2, 0));
2037   this->delta_y = fs_reg(brw_vec8_grf(3, 0));
2038
2039   this->current_annotation = NULL;
2040}
2041
2042void
2043fs_visitor::emit_color_write(int index, int first_color_mrf, fs_reg color)
2044{
2045   int reg_width = c->dispatch_width / 8;
2046
2047   if (c->dispatch_width == 8 || intel->gen == 6) {
2048      /* SIMD8 write looks like:
2049       * m + 0: r0
2050       * m + 1: r1
2051       * m + 2: g0
2052       * m + 3: g1
2053       *
2054       * gen6 SIMD16 DP write looks like:
2055       * m + 0: r0
2056       * m + 1: r1
2057       * m + 2: g0
2058       * m + 3: g1
2059       * m + 4: b0
2060       * m + 5: b1
2061       * m + 6: a0
2062       * m + 7: a1
2063       */
2064      emit(BRW_OPCODE_MOV, fs_reg(MRF, first_color_mrf + index * reg_width),
2065	   color);
2066   } else {
2067      /* pre-gen6 SIMD16 single source DP write looks like:
2068       * m + 0: r0
2069       * m + 1: g0
2070       * m + 2: b0
2071       * m + 3: a0
2072       * m + 4: r1
2073       * m + 5: g1
2074       * m + 6: b1
2075       * m + 7: a1
2076       */
2077      if (brw->has_compr4) {
2078	 /* By setting the high bit of the MRF register number, we
2079	  * indicate that we want COMPR4 mode - instead of doing the
2080	  * usual destination + 1 for the second half we get
2081	  * destination + 4.
2082	  */
2083	 emit(BRW_OPCODE_MOV,
2084	      fs_reg(MRF, BRW_MRF_COMPR4 + first_color_mrf + index), color);
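	 /* For example (illustrative values): with first_color_mrf == 2
	  * and index == 1, this single MOV writes the first half to m3
	  * and the second half to m3 + 4 == m7, matching the layout in
	  * the comment above.
	  */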
2085      } else {
2086	 push_force_uncompressed();
2087	 emit(BRW_OPCODE_MOV, fs_reg(MRF, first_color_mrf + index), color);
2088	 pop_force_uncompressed();
2089
2090	 push_force_sechalf();
2091	 color.sechalf = true;
2092	 emit(BRW_OPCODE_MOV, fs_reg(MRF, first_color_mrf + index + 4), color);
2093	 pop_force_sechalf();
2094	 color.sechalf = false;
2095      }
2096   }
2097}
2098
2099void
2100fs_visitor::emit_fb_writes()
2101{
2102   this->current_annotation = "FB write header";
2103   GLboolean header_present = GL_TRUE;
2104   int nr = 0;
2105   int reg_width = c->dispatch_width / 8;
2106
2107   if (intel->gen >= 6 &&
2108       !this->kill_emitted &&
2109       c->key.nr_color_regions == 1) {
2110      header_present = false;
2111   }
2112
2113   if (header_present) {
2114      /* m0, m1 header */
2115      nr += 2;
2116   }
2117
2118   if (c->aa_dest_stencil_reg) {
2119      push_force_uncompressed();
2120      emit(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
2121	   fs_reg(brw_vec8_grf(c->aa_dest_stencil_reg, 0)));
2122      pop_force_uncompressed();
2123   }
2124
2125   /* Reserve space for color. It'll be filled in per MRT below. */
2126   int color_mrf = nr;
2127   nr += 4 * reg_width;
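   /* For example (illustrative, assuming a header and no AA dest
    * stencil payload): a SIMD16 shader has color_mrf == 2 here, and
    * the four color channels occupy m2..m9 (4 channels * 2 regs each).
    */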
2128
2129   if (c->source_depth_to_render_target) {
2130      if (intel->gen == 6 && c->dispatch_width == 16) {
2131	 /* For outputting oDepth on gen6, SIMD8 writes have to be
2132	  * used.  This would require 8-wide moves of each half to
2133	  * message regs, kind of like pre-gen5 SIMD16 FB writes.
2134	  * Just bail on doing so for now.
2135	  */
2136	 fail("Missing support for simd16 depth writes on gen6\n");
2137      }
2138
2139      if (c->computes_depth) {
2140	 /* Hand over gl_FragDepth. */
2141	 assert(this->frag_depth);
2142	 fs_reg depth = *(variable_storage(this->frag_depth));
2143
2144	 emit(BRW_OPCODE_MOV, fs_reg(MRF, nr), depth);
2145      } else {
2146	 /* Pass through the payload depth. */
2147	 emit(BRW_OPCODE_MOV, fs_reg(MRF, nr),
2148	      fs_reg(brw_vec8_grf(c->source_depth_reg, 0)));
2149      }
2150      nr += reg_width;
2151   }
2152
2153   if (c->dest_depth_reg) {
2154      emit(BRW_OPCODE_MOV, fs_reg(MRF, nr),
2155	   fs_reg(brw_vec8_grf(c->dest_depth_reg, 0)));
2156      nr += reg_width;
2157   }
2158
2159   fs_reg color = reg_undef;
2160   if (this->frag_color)
2161      color = *(variable_storage(this->frag_color));
2162   else if (this->frag_data) {
2163      color = *(variable_storage(this->frag_data));
2164      color.type = BRW_REGISTER_TYPE_F;
2165   }
2166
2167   for (int target = 0; target < c->key.nr_color_regions; target++) {
2168      this->current_annotation = ralloc_asprintf(this->mem_ctx,
2169						 "FB write target %d",
2170						 target);
2171      if (this->frag_color || this->frag_data) {
2172	 for (int i = 0; i < 4; i++) {
2173	    emit_color_write(i, color_mrf, color);
2174	    color.reg_offset++;
2175	 }
2176      }
2177
2178      if (this->frag_color)
2179	 color.reg_offset -= 4;
2180
2181      fs_inst *inst = emit(FS_OPCODE_FB_WRITE);
2182      inst->target = target;
2183      inst->base_mrf = 0;
2184      inst->mlen = nr;
2185      if (target == c->key.nr_color_regions - 1)
2186	 inst->eot = true;
2187      inst->header_present = header_present;
2188   }
2189
2190   if (c->key.nr_color_regions == 0) {
2191      if (c->key.alpha_test && (this->frag_color || this->frag_data)) {
2192	 /* If the alpha test is enabled but there's no color buffer,
2193	  * we still need to send alpha out the pipeline to our null
2194	  * renderbuffer.
2195	  */
2196	 color.reg_offset += 3;
2197	 emit_color_write(3, color_mrf, color);
2198      }
2199
2200      fs_inst *inst = emit(FS_OPCODE_FB_WRITE);
2201      inst->base_mrf = 0;
2202      inst->mlen = nr;
2203      inst->eot = true;
2204      inst->header_present = header_present;
2205   }
2206
2207   this->current_annotation = NULL;
2208}
2209
2210void
2211fs_visitor::generate_fb_write(fs_inst *inst)
2212{
2213   GLboolean eot = inst->eot;
2214   struct brw_reg implied_header;
2215
2216   /* The header is 2 regs; g0 and g1 are the contents.  g0 is handled
2217    * by the SEND's implied move, so here we copy g1.
2218    */
2219   brw_push_insn_state(p);
2220   brw_set_mask_control(p, BRW_MASK_DISABLE);
2221   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2222
2223   if (inst->header_present) {
2224      if (intel->gen >= 6) {
2225	 brw_MOV(p,
2226		 brw_message_reg(inst->base_mrf),
2227		 brw_vec8_grf(0, 0));
2228
2229	 if (inst->target > 0) {
2230	    /* Set the render target index for choosing BLEND_STATE. */
2231	    brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, 0, 2),
2232			      BRW_REGISTER_TYPE_UD),
2233		    brw_imm_ud(inst->target));
2234	 }
2235
2236	 /* Clear viewport index, render target array index. */
2237	 brw_AND(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, 0, 0),
2238			   BRW_REGISTER_TYPE_UD),
2239		 retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD),
2240		 brw_imm_ud(0xf7ff));
2241
2242	 implied_header = brw_null_reg();
2243      } else {
2244	 implied_header = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
2245      }
2246
2247      brw_MOV(p,
2248	      brw_message_reg(inst->base_mrf + 1),
2249	      brw_vec8_grf(1, 0));
2250   } else {
2251      implied_header = brw_null_reg();
2252   }
2253
2254   brw_pop_insn_state(p);
2255
2256   brw_fb_WRITE(p,
2257		c->dispatch_width,
2258		inst->base_mrf,
2259		implied_header,
2260		inst->target,
2261		inst->mlen,
2262		0,
2263		eot,
2264		inst->header_present);
2265}
2266
2267/* Computes the integer pixel x,y values from the origin.
2268 *
2269 * This is the basis of gl_FragCoord computation, but is also used
2270 * pre-gen6 for computing the deltas from v0 for computing
2271 * interpolation.
2272 */
2273void
2274fs_visitor::generate_pixel_xy(struct brw_reg dst, bool is_x)
2275{
2276   struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
2277   struct brw_reg src;
2278   struct brw_reg deltas;
2279
2280   if (is_x) {
2281      src = stride(suboffset(g1_uw, 4), 2, 4, 0);
2282      deltas = brw_imm_v(0x10101010);
2283   } else {
2284      src = stride(suboffset(g1_uw, 5), 2, 4, 0);
2285      deltas = brw_imm_v(0x11001100);
2286   }
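   /* brw_imm_v encodes eight signed 4-bit values, lowest nibble first:
    * 0x10101010 is <0,1,0,1,0,1,0,1> (the per-pixel X offsets within
    * each 2x2 subspan) and 0x11001100 is <0,0,1,1,0,0,1,1> (the Y
    * offsets).
    */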
2287
2288   if (c->dispatch_width == 16) {
2289      dst = vec16(dst);
2290   }
2291
2292   /* We do this 8 or 16-wide, but since the destination is UW we
2293    * don't do compression in the 16-wide case.
2294    */
2295   brw_push_insn_state(p);
2296   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2297   brw_ADD(p, dst, src, deltas);
2298   brw_pop_insn_state(p);
2299}
2300
2301void
2302fs_visitor::generate_linterp(fs_inst *inst,
2303			     struct brw_reg dst, struct brw_reg *src)
2304{
2305   struct brw_reg delta_x = src[0];
2306   struct brw_reg delta_y = src[1];
2307   struct brw_reg interp = src[2];
2308
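   /* PLN needs delta_x and delta_y to live in adjacent registers, and
    * before gen6 that pair must also start on an even register number;
    * otherwise we fall back to a LINE + MAC sequence.
    */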
2309   if (brw->has_pln &&
2310       delta_y.nr == delta_x.nr + 1 &&
2311       (intel->gen >= 6 || (delta_x.nr & 1) == 0)) {
2312      brw_PLN(p, dst, interp, delta_x);
2313   } else {
2314      brw_LINE(p, brw_null_reg(), interp, delta_x);
2315      brw_MAC(p, dst, suboffset(interp, 1), delta_y);
2316   }
2317}
2318
2319void
2320fs_visitor::generate_math(fs_inst *inst,
2321			  struct brw_reg dst, struct brw_reg *src)
2322{
2323   int op;
2324
2325   switch (inst->opcode) {
2326   case FS_OPCODE_RCP:
2327      op = BRW_MATH_FUNCTION_INV;
2328      break;
2329   case FS_OPCODE_RSQ:
2330      op = BRW_MATH_FUNCTION_RSQ;
2331      break;
2332   case FS_OPCODE_SQRT:
2333      op = BRW_MATH_FUNCTION_SQRT;
2334      break;
2335   case FS_OPCODE_EXP2:
2336      op = BRW_MATH_FUNCTION_EXP;
2337      break;
2338   case FS_OPCODE_LOG2:
2339      op = BRW_MATH_FUNCTION_LOG;
2340      break;
2341   case FS_OPCODE_POW:
2342      op = BRW_MATH_FUNCTION_POW;
2343      break;
2344   case FS_OPCODE_SIN:
2345      op = BRW_MATH_FUNCTION_SIN;
2346      break;
2347   case FS_OPCODE_COS:
2348      op = BRW_MATH_FUNCTION_COS;
2349      break;
2350   default:
2351      assert(!"not reached: unknown math function");
2352      op = 0;
2353      break;
2354   }
2355
2356   if (intel->gen >= 6) {
2357      assert(inst->mlen == 0);
2358
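      /* On gen6 and later, math is an ordinary ALU instruction reading
       * its operands directly from GRFs, but (as the code below
       * assumes) it can't run compressed, so SIMD16 is emitted as two
       * SIMD8 halves.
       */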
2359      if (inst->opcode == FS_OPCODE_POW) {
2360	 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2361	 brw_math2(p, dst, op, src[0], src[1]);
2362
2363	 if (c->dispatch_width == 16) {
2364	    brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
2365	    brw_math2(p, sechalf(dst), op, sechalf(src[0]), sechalf(src[1]));
2366	    brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
2367	 }
2368      } else {
2369	 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2370	 brw_math(p, dst,
2371		  op,
2372		  inst->saturate ? BRW_MATH_SATURATE_SATURATE :
2373		  BRW_MATH_SATURATE_NONE,
2374		  0, src[0],
2375		  BRW_MATH_DATA_VECTOR,
2376		  BRW_MATH_PRECISION_FULL);
2377
2378	 if (c->dispatch_width == 16) {
2379	    brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
2380	    brw_math(p, sechalf(dst),
2381		     op,
2382		     inst->saturate ? BRW_MATH_SATURATE_SATURATE :
2383		     BRW_MATH_SATURATE_NONE,
2384		     0, sechalf(src[0]),
2385		     BRW_MATH_DATA_VECTOR,
2386		     BRW_MATH_PRECISION_FULL);
2387	    brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
2388	 }
2389      }
2390   } else /* gen <= 5 */ {
2391      assert(inst->mlen >= 1);
2392
2393      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2394      brw_math(p, dst,
2395	       op,
2396	       inst->saturate ? BRW_MATH_SATURATE_SATURATE :
2397	       BRW_MATH_SATURATE_NONE,
2398	       inst->base_mrf, src[0],
2399	       BRW_MATH_DATA_VECTOR,
2400	       BRW_MATH_PRECISION_FULL);
2401
2402      if (c->dispatch_width == 16) {
2403	 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
2404	 brw_math(p, sechalf(dst),
2405		  op,
2406		  inst->saturate ? BRW_MATH_SATURATE_SATURATE :
2407		  BRW_MATH_SATURATE_NONE,
2408		  inst->base_mrf + 1, sechalf(src[0]),
2409		  BRW_MATH_DATA_VECTOR,
2410		  BRW_MATH_PRECISION_FULL);
2411
2412	 brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
2413      }
2414   }
2415}
2416
2417void
2418fs_visitor::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
2419{
2420   int msg_type = -1;
2421   int rlen = 4;
2422   uint32_t simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
2423
2424   if (c->dispatch_width == 16) {
2425      rlen = 8;
2426      dst = vec16(dst);
2427      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
2428   }
2429
2430   if (intel->gen >= 5) {
2431      switch (inst->opcode) {
2432      case FS_OPCODE_TEX:
2433	 if (inst->shadow_compare) {
2434	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE;
2435	 } else {
2436	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE;
2437	 }
2438	 break;
2439      case FS_OPCODE_TXB:
2440	 if (inst->shadow_compare) {
2441	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE;
2442	 } else {
2443	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS;
2444	 }
2445	 break;
2446      case FS_OPCODE_TXL:
2447	 if (inst->shadow_compare) {
2448	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE;
2449	 } else {
2450	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD;
2451	 }
2452	 break;
2453      case FS_OPCODE_TXD:
2454	 assert(!"TXD isn't supported on gen5+ yet.");
2455	 break;
2456      }
2457   } else {
2458      switch (inst->opcode) {
2459      case FS_OPCODE_TEX:
2460	 /* Note that G45 and older determine shadow compare and dispatch width
2461	  * from message length for most messages.
2462	  */
2463	 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
2464	 if (inst->shadow_compare) {
2465	    assert(inst->mlen == 6);
2466	 } else {
2467	    assert(inst->mlen <= 4);
2468	 }
2469	 break;
2470      case FS_OPCODE_TXB:
2471	 if (inst->shadow_compare) {
2472	    assert(inst->mlen == 6);
2473	    msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_COMPARE;
2474	 } else {
2475	    assert(inst->mlen == 9);
2476	    msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
2477	    simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
2478	 }
2479	 break;
2480      case FS_OPCODE_TXL:
2481	 if (inst->shadow_compare) {
2482	    assert(inst->mlen == 6);
2483	    msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE;
2484	 } else {
2485	    assert(inst->mlen == 9);
2486	    msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD;
2487	    simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
2488	 }
2489	 break;
2490      case FS_OPCODE_TXD:
2491	 assert(!"TXD isn't supported on gen4 yet.");
2492	 break;
2493      }
2494   }
2495   assert(msg_type != -1);
2496
2497   brw_SAMPLE(p,
2498	      retype(dst, BRW_REGISTER_TYPE_UW),
2499	      inst->base_mrf,
2500	      src,
2501              SURF_INDEX_TEXTURE(inst->sampler),
2502	      inst->sampler,
2503	      WRITEMASK_XYZW,
2504	      msg_type,
2505	      rlen,
2506	      inst->mlen,
2507	      0,
2508	      1,
2509	      simd_mode);
2510}
2511
2512
2513/* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
2514 * looking like:
2515 *
2516 * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
2517 *
2518 * and we're trying to produce:
2519 *
2520 *           DDX                     DDY
2521 * dst: (ss0.tr - ss0.tl)     (ss0.tl - ss0.bl)
2522 *      (ss0.tr - ss0.tl)     (ss0.tr - ss0.br)
2523 *      (ss0.br - ss0.bl)     (ss0.tl - ss0.bl)
2524 *      (ss0.br - ss0.bl)     (ss0.tr - ss0.br)
2525 *      (ss1.tr - ss1.tl)     (ss1.tl - ss1.bl)
2526 *      (ss1.tr - ss1.tl)     (ss1.tr - ss1.br)
2527 *      (ss1.br - ss1.bl)     (ss1.tl - ss1.bl)
2528 *      (ss1.br - ss1.bl)     (ss1.tr - ss1.br)
2529 *
2530 * and add another set of two more subspans if in 16-pixel dispatch mode.
2531 *
2532 * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
2533 * for each pair, and vertstride = 2 jumps us 2 elements after processing a
2534 * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
2535 * between each other.  We could probably do it like ddx and swizzle the right
2536 * order later, but bail for now and just produce
2537 * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4).
2538 */
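/* In register-region notation (illustrative), the ADDs emitted below
 * compute:
 *
 *    ddx: dst = src.1<2;2,0>F - src.0<2;2,0>F
 *    ddy: dst = src.0<4;4,0>F - src.2<4;4,0>F
 */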
2539void
2540fs_visitor::generate_ddx(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
2541{
2542   struct brw_reg src0 = brw_reg(src.file, src.nr, 1,
2543				 BRW_REGISTER_TYPE_F,
2544				 BRW_VERTICAL_STRIDE_2,
2545				 BRW_WIDTH_2,
2546				 BRW_HORIZONTAL_STRIDE_0,
2547				 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
2548   struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
2549				 BRW_REGISTER_TYPE_F,
2550				 BRW_VERTICAL_STRIDE_2,
2551				 BRW_WIDTH_2,
2552				 BRW_HORIZONTAL_STRIDE_0,
2553				 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
2554   brw_ADD(p, dst, src0, negate(src1));
2555}
2556
2557void
2558fs_visitor::generate_ddy(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
2559{
2560   struct brw_reg src0 = brw_reg(src.file, src.nr, 0,
2561				 BRW_REGISTER_TYPE_F,
2562				 BRW_VERTICAL_STRIDE_4,
2563				 BRW_WIDTH_4,
2564				 BRW_HORIZONTAL_STRIDE_0,
2565				 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
2566   struct brw_reg src1 = brw_reg(src.file, src.nr, 2,
2567				 BRW_REGISTER_TYPE_F,
2568				 BRW_VERTICAL_STRIDE_4,
2569				 BRW_WIDTH_4,
2570				 BRW_HORIZONTAL_STRIDE_0,
2571				 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
2572   brw_ADD(p, dst, src0, negate(src1));
2573}
2574
2575void
2576fs_visitor::generate_discard_not(fs_inst *inst, struct brw_reg mask)
2577{
2578   if (intel->gen >= 6) {
2579      /* Gen6 no longer has the mask reg for us to just read the
2580       * active channels from.  However, cmp updates just the channels
2581       * of the flag reg that are enabled, so we can get at the
2582       * channel enables that way.  In this step, make a reg of ones
2583       * we'll compare to.
2584       */
2585      brw_MOV(p, mask, brw_imm_ud(1));
2586   } else {
2587      brw_push_insn_state(p);
2588      brw_set_mask_control(p, BRW_MASK_DISABLE);
2589      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2590      brw_NOT(p, mask, brw_mask_reg(1)); /* IMASK */
2591      brw_pop_insn_state(p);
2592   }
2593}
2594
2595void
2596fs_visitor::generate_discard_and(fs_inst *inst, struct brw_reg mask)
2597{
2598   if (intel->gen >= 6) {
2599      struct brw_reg f0 = brw_flag_reg();
2600      struct brw_reg g1 = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
2601
2602      brw_push_insn_state(p);
2603      brw_set_mask_control(p, BRW_MASK_DISABLE);
2604      brw_MOV(p, f0, brw_imm_uw(0xffff)); /* inactive channels undiscarded */
2605      brw_pop_insn_state(p);
2606
2607      brw_CMP(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UD),
2608	      BRW_CONDITIONAL_Z, mask, brw_imm_ud(0)); /* active channels fail test */
2609	      /* Undo CMP's whacking of predication. */
2610      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2611
2612      brw_push_insn_state(p);
2613      brw_set_mask_control(p, BRW_MASK_DISABLE);
2614      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2615      brw_AND(p, g1, f0, g1);
2616      brw_pop_insn_state(p);
2617   } else {
2618      struct brw_reg g0 = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
2619
2620      mask = brw_uw1_reg(mask.file, mask.nr, 0);
2621
2622      brw_push_insn_state(p);
2623      brw_set_mask_control(p, BRW_MASK_DISABLE);
2624      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2625      brw_AND(p, g0, mask, g0);
2626      brw_pop_insn_state(p);
2627   }
2628}
2629
2630void
2631fs_visitor::generate_spill(fs_inst *inst, struct brw_reg src)
2632{
2633   assert(inst->mlen != 0);
2634
2635   brw_MOV(p,
2636	   retype(brw_message_reg(inst->base_mrf + 1), BRW_REGISTER_TYPE_UD),
2637	   retype(src, BRW_REGISTER_TYPE_UD));
2638   brw_oword_block_write_scratch(p, brw_message_reg(inst->base_mrf), 1,
2639				 inst->offset);
2640}
2641
2642void
2643fs_visitor::generate_unspill(fs_inst *inst, struct brw_reg dst)
2644{
2645   assert(inst->mlen != 0);
2646
2647   /* Clear any post destination dependencies that would be ignored by
2648    * the block read.  See the B-Spec for the pre-gen5 send instruction.
2649    *
2650    * This could use a better solution, since texture sampling and
2651    * math reads could potentially run into it as well -- anywhere
2652    * that we have a SEND with a destination that is a register that
2653    * was written but not read within the last N instructions (what's
2654    * N?  unsure).  This is rare because of dead code elimination, but
2655    * not impossible.
2656    */
2657   if (intel->gen == 4 && !intel->is_g4x)
2658      brw_MOV(p, brw_null_reg(), dst);
2659
2660   brw_oword_block_read_scratch(p, dst, brw_message_reg(inst->base_mrf), 1,
2661				inst->offset);
2662
2663   if (intel->gen == 4 && !intel->is_g4x) {
2664      /* gen4 errata: destination from a send can't be used as a
2665       * destination until it's been read.  Just read it so we don't
2666       * have to worry.
2667       */
2668      brw_MOV(p, brw_null_reg(), dst);
2669   }
2670}
2671
2672
2673void
2674fs_visitor::generate_pull_constant_load(fs_inst *inst, struct brw_reg dst)
2675{
2676   assert(inst->mlen != 0);
2677
2678   /* Clear any post destination dependencies that would be ignored by
2679    * the block read.  See the B-Spec for the pre-gen5 send instruction.
2680    *
2681    * This could use a better solution, since texture sampling and
2682    * math reads could potentially run into it as well -- anywhere
2683    * that we have a SEND with a destination that is a register that
2684    * was written but not read within the last N instructions (what's
2685    * N?  unsure).  This is rare because of dead code elimination, but
2686    * not impossible.
2687    */
2688   if (intel->gen == 4 && !intel->is_g4x)
2689      brw_MOV(p, brw_null_reg(), dst);
2690
2691   brw_oword_block_read(p, dst, brw_message_reg(inst->base_mrf),
2692			inst->offset, SURF_INDEX_FRAG_CONST_BUFFER);
2693
2694   if (intel->gen == 4 && !intel->is_g4x) {
2695      /* gen4 errata: destination from a send can't be used as a
2696       * destination until it's been read.  Just read it so we don't
2697       * have to worry.
2698       */
2699      brw_MOV(p, brw_null_reg(), dst);
2700   }
2701}
2702
2703/**
2704 * To be called after the last _mesa_add_state_reference() call, to
2705 * set up prog_data.param[] for assign_curb_setup() and
2706 * setup_pull_constants().
2707 */
2708void
2709fs_visitor::setup_paramvalues_refs()
2710{
2711   /* Set up the pointers to ParamValues now that that array is finalized. */
2712   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
2713      c->prog_data.param[i] =
2714	 fp->Base.Parameters->ParameterValues[this->param_index[i]] +
2715	 this->param_offset[i];
2716   }
2717}
2718
2719void
2720fs_visitor::assign_curb_setup()
2721{
2722   c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
2723   if (c->dispatch_width == 8) {
2724      c->prog_data.first_curbe_grf = c->nr_payload_regs;
2725   } else {
2726      c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
2727   }
2728
2729   /* Map the offsets in the UNIFORM file to fixed HW regs. */
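   /* For example (illustrative values): with nr_payload_regs == 2,
    * uniform component 10 maps to g3.2 (2 + 10 / 8 == 3, 10 % 8 == 2).
    */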
2730   foreach_iter(exec_list_iterator, iter, this->instructions) {
2731      fs_inst *inst = (fs_inst *)iter.get();
2732
2733      for (unsigned int i = 0; i < 3; i++) {
2734	 if (inst->src[i].file == UNIFORM) {
2735	    int constant_nr = inst->src[i].hw_reg + inst->src[i].reg_offset;
2736	    struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
2737						  constant_nr / 8,
2738						  constant_nr % 8);
2739
2740	    inst->src[i].file = FIXED_HW_REG;
2741	    inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
2742	 }
2743      }
2744   }
2745}
2746
2747void
2748fs_visitor::calculate_urb_setup()
2749{
2750   for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
2751      urb_setup[i] = -1;
2752   }
2753
2754   int urb_next = 0;
2755   /* Figure out where each of the incoming setup attributes lands. */
2756   if (intel->gen >= 6) {
2757      for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
2758	 if (brw->fragment_program->Base.InputsRead & BITFIELD64_BIT(i)) {
2759	    urb_setup[i] = urb_next++;
2760	 }
2761      }
2762   } else {
2763      /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
2764      for (unsigned int i = 0; i < VERT_RESULT_MAX; i++) {
2765	 if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
2766	    int fp_index;
2767
2768	    if (i >= VERT_RESULT_VAR0)
2769	       fp_index = i - (VERT_RESULT_VAR0 - FRAG_ATTRIB_VAR0);
2770	    else if (i <= VERT_RESULT_TEX7)
2771	       fp_index = i;
2772	    else
2773	       fp_index = -1;
2774
2775	    if (fp_index >= 0)
2776	       urb_setup[fp_index] = urb_next++;
2777	 }
2778      }
2779   }
2780
2781   /* Each attribute is 4 setup channels, each of which is half a reg. */
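   /* For example (illustrative): a shader reading two varyings has
    * urb_next == 2 and therefore a urb_read_length of 4 registers.
    */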
2782   c->prog_data.urb_read_length = urb_next * 2;
2783}
2784
2785void
2786fs_visitor::assign_urb_setup()
2787{
2788   int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
2789
2790   /* Offset all the urb_setup[] indices by the actual position of the
2791    * setup regs, now that the location of the constants has been chosen.
2792    */
2793   foreach_iter(exec_list_iterator, iter, this->instructions) {
2794      fs_inst *inst = (fs_inst *)iter.get();
2795
2796      if (inst->opcode == FS_OPCODE_LINTERP) {
2797	 assert(inst->src[2].file == FIXED_HW_REG);
2798	 inst->src[2].fixed_hw_reg.nr += urb_start;
2799      }
2800
2801      if (inst->opcode == FS_OPCODE_CINTERP) {
2802	 assert(inst->src[0].file == FIXED_HW_REG);
2803	 inst->src[0].fixed_hw_reg.nr += urb_start;
2804      }
2805   }
2806
2807   this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
2808}
2809
2810/**
2811 * Split large virtual GRFs into separate components if we can.
2812 *
2813 * This is mostly duplicated with what brw_fs_vector_splitting does,
2814 * but that's really conservative because it's afraid of doing
2815 * splitting that doesn't result in real progress after the rest of
2816 * the optimization phases, which would cause infinite looping in
2817 * optimization.  We can do it once here, safely.  This also has the
2818 * opportunity to split interpolated values, or maybe even uniforms,
2819 * which we don't have at the IR level.
2820 *
2821 * We want to split, because virtual GRFs are what we register
2822 * allocate and spill (due to contiguousness requirements for some
2823 * instructions), and they're what we naturally generate in the
2824 * codegen process, but most virtual GRFs don't actually need to be
2825 * contiguous sets of GRFs.  If we split, we'll end up with reduced
2826 * live intervals and better dead code elimination and coalescing.
2827 */
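/* Illustrative example: a size-4 virtual GRF holding a vec4 is split
 * so that reg_offset 0 stays in the original (now size-1) register,
 * while offsets 1..3 are rewritten to three freshly allocated size-1
 * registers.
 */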
2828void
2829fs_visitor::split_virtual_grfs()
2830{
2831   int num_vars = this->virtual_grf_next;
2832   bool split_grf[num_vars];
2833   int new_virtual_grf[num_vars];
2834
2835   /* Try to split anything larger than one register. */
2836   for (int i = 0; i < num_vars; i++) {
2837      if (this->virtual_grf_sizes[i] != 1)
2838	 split_grf[i] = true;
2839      else
2840	 split_grf[i] = false;
2841   }
2842
2843   if (brw->has_pln) {
2844      /* PLN opcodes rely on the delta_xy being contiguous. */
2845      split_grf[this->delta_x.reg] = false;
2846   }
2847
2848   foreach_iter(exec_list_iterator, iter, this->instructions) {
2849      fs_inst *inst = (fs_inst *)iter.get();
2850
2851      /* Texturing produces 4 contiguous registers, so no splitting. */
2852      if (inst->is_tex()) {
2853	 split_grf[inst->dst.reg] = false;
2854      }
2855   }
2856
2857   /* Allocate new space for split regs.  Note that the virtual
2858    * numbers will be contiguous.
2859    */
2860   for (int i = 0; i < num_vars; i++) {
2861      if (split_grf[i]) {
2862	 new_virtual_grf[i] = virtual_grf_alloc(1);
2863	 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
2864	    int reg = virtual_grf_alloc(1);
2865	    assert(reg == new_virtual_grf[i] + j - 1);
2866	    (void) reg;
2867	 }
2868	 this->virtual_grf_sizes[i] = 1;
2869      }
2870   }
2871
2872   foreach_iter(exec_list_iterator, iter, this->instructions) {
2873      fs_inst *inst = (fs_inst *)iter.get();
2874
2875      if (inst->dst.file == GRF &&
2876	  split_grf[inst->dst.reg] &&
2877	  inst->dst.reg_offset != 0) {
2878	 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
2879			  inst->dst.reg_offset - 1);
2880	 inst->dst.reg_offset = 0;
2881      }
2882      for (int i = 0; i < 3; i++) {
2883	 if (inst->src[i].file == GRF &&
2884	     split_grf[inst->src[i].reg] &&
2885	     inst->src[i].reg_offset != 0) {
2886	    inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
2887				inst->src[i].reg_offset - 1);
2888	    inst->src[i].reg_offset = 0;
2889	 }
2890      }
2891   }
2892   this->live_intervals_valid = false;
2893}
2894
2895/**
2896 * Choose accesses from the UNIFORM file to demote to using the pull
2897 * constant buffer.
2898 *
2899 * We allow a fragment shader to have more than the specified minimum
2900 * maximum number of fragment shader uniform components (64).  If
2901 * there are too many of these, they'd fill up all of the register space.
2902 * So, this will push some of them out to the pull constant buffer and
2903 * update the program to load them.
2904 */
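/* Worked example (illustrative values): with nr_params == 160, the
 * first 128 components stay as push constants and the last 32 become
 * pull constants.  A use of uniform component 130 turns into an
 * oword-block load at offset ((130 - 128) * 4) & ~15 == 0, reading
 * component (130 - 128) & 3 == 2 of the fetched data via smear.
 */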
2905void
2906fs_visitor::setup_pull_constants()
2907{
2908   /* Only allow 16 registers (128 uniform components) as push constants. */
2909   unsigned int max_uniform_components = 16 * 8;
2910   if (c->prog_data.nr_params <= max_uniform_components)
2911      return;
2912
2913   /* Just demote the end of the list.  We could probably do better
2914    * here, demoting things that are rarely used in the program first.
2915    */
2916   int pull_uniform_base = max_uniform_components;
2917   int pull_uniform_count = c->prog_data.nr_params - pull_uniform_base;
2918
2919   foreach_iter(exec_list_iterator, iter, this->instructions) {
2920      fs_inst *inst = (fs_inst *)iter.get();
2921
2922      for (int i = 0; i < 3; i++) {
2923	 if (inst->src[i].file != UNIFORM)
2924	    continue;
2925
2926	 int uniform_nr = inst->src[i].hw_reg + inst->src[i].reg_offset;
2927	 if (uniform_nr < pull_uniform_base)
2928	    continue;
2929
2930	 fs_reg dst = fs_reg(this, glsl_type::float_type);
2931	 fs_inst *pull = new(mem_ctx) fs_inst(FS_OPCODE_PULL_CONSTANT_LOAD,
2932					      dst);
2933	 pull->offset = ((uniform_nr - pull_uniform_base) * 4) & ~15;
2934	 pull->ir = inst->ir;
2935	 pull->annotation = inst->annotation;
2936	 pull->base_mrf = 14;
2937	 pull->mlen = 1;
2938
2939	 inst->insert_before(pull);
2940
2941	 inst->src[i].file = GRF;
2942	 inst->src[i].reg = dst.reg;
2943	 inst->src[i].reg_offset = 0;
2944	 inst->src[i].smear = (uniform_nr - pull_uniform_base) & 3;
2945      }
2946   }
2947
2948   for (int i = 0; i < pull_uniform_count; i++) {
2949      c->prog_data.pull_param[i] = c->prog_data.param[pull_uniform_base + i];
2950      c->prog_data.pull_param_convert[i] =
2951	 c->prog_data.param_convert[pull_uniform_base + i];
2952   }
2953   c->prog_data.nr_params -= pull_uniform_count;
2954   c->prog_data.nr_pull_params = pull_uniform_count;
2955}
2956
2957void
2958fs_visitor::calculate_live_intervals()
2959{
2960   int num_vars = this->virtual_grf_next;
2961   int *def = ralloc_array(mem_ctx, int, num_vars);
2962   int *use = ralloc_array(mem_ctx, int, num_vars);
2963   int loop_depth = 0;
2964   int loop_start = 0;
2965   int bb_header_ip = 0;
2966
2967   if (this->live_intervals_valid)
2968      return;
2969
2970   for (int i = 0; i < num_vars; i++) {
2971      def[i] = MAX_INSTRUCTION;
2972      use[i] = -1;
2973   }
2974
2975   int ip = 0;
2976   foreach_iter(exec_list_iterator, iter, this->instructions) {
2977      fs_inst *inst = (fs_inst *)iter.get();
2978
2979      if (inst->opcode == BRW_OPCODE_DO) {
2980	 if (loop_depth++ == 0)
2981	    loop_start = ip;
2982      } else if (inst->opcode == BRW_OPCODE_WHILE) {
2983	 loop_depth--;
2984
2985	 if (loop_depth == 0) {
2986	    /* Patches up the use of vars marked for being live across
2987	     * the whole loop.
2988	     */
2989	    for (int i = 0; i < num_vars; i++) {
2990	       if (use[i] == loop_start) {
2991		  use[i] = ip;
2992	       }
2993	    }
2994	 }
2995      } else {
2996	 for (unsigned int i = 0; i < 3; i++) {
2997	    if (inst->src[i].file == GRF && inst->src[i].reg != 0) {
2998	       int reg = inst->src[i].reg;
2999
3000	       if (!loop_depth) {
3001		  use[reg] = ip;
3002	       } else {
3003		  def[reg] = MIN2(loop_start, def[reg]);
3004		  use[reg] = loop_start;
3005
3006		  /* Nobody else is going to go smash our start to
3007		   * later in the loop now, because def[reg] now
3008		   * points before the bb header.
3009		   */
3010	       }
3011	    }
3012	 }
3013	 if (inst->dst.file == GRF && inst->dst.reg != 0) {
3014	    int reg = inst->dst.reg;
3015
3016	    if (!loop_depth) {
3017	       def[reg] = MIN2(def[reg], ip);
3018	    } else {
3019	       def[reg] = MIN2(def[reg], loop_start);
3020	    }
3021	 }
3022      }
3023
3024      ip++;
3025
3026      /* Set the basic block header IP.  This is used for determining
3027       * if a complete def of single-register virtual GRF in a loop
3028       * dominates a use in the same basic block.  It's a quick way to
3029       * reduce the live interval range of most registers used in a
3030       * loop.
3031       */
3032      if (inst->opcode == BRW_OPCODE_IF ||
3033	  inst->opcode == BRW_OPCODE_ELSE ||
3034	  inst->opcode == BRW_OPCODE_ENDIF ||
3035	  inst->opcode == BRW_OPCODE_DO ||
3036	  inst->opcode == BRW_OPCODE_WHILE ||
3037	  inst->opcode == BRW_OPCODE_BREAK ||
3038	  inst->opcode == BRW_OPCODE_CONTINUE) {
3039	 bb_header_ip = ip;
3040      }
3041   }
3042
3043   ralloc_free(this->virtual_grf_def);
3044   ralloc_free(this->virtual_grf_use);
3045   this->virtual_grf_def = def;
3046   this->virtual_grf_use = use;
3047
3048   this->live_intervals_valid = true;
3049}
3050
3051/**
3052 * Attempts to move immediate constants into the immediate
3053 * constant slot of following instructions.
3054 *
3055 * Immediate constants are a bit tricky -- they have to be in the last
3056 * operand slot, and you can't do abs/negate on them.
3057 */
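/* For example (illustrative):
 *
 *    mov vgrf5, 2.0F
 *    add vgrf6, vgrf4, vgrf5
 *
 * becomes
 *
 *    mov vgrf5, 2.0F
 *    add vgrf6, vgrf4, 2.0F
 *
 * after which dead code elimination can remove the MOV.
 */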
3058
3059bool
3060fs_visitor::propagate_constants()
3061{
3062   bool progress = false;
3063
3064   /* Need to update the MRF tracking for compressed instructions. */
3065   if (c->dispatch_width == 16)
3066      return false;
3067
3068   calculate_live_intervals();
3069
3070   foreach_iter(exec_list_iterator, iter, this->instructions) {
3071      fs_inst *inst = (fs_inst *)iter.get();
3072
3073      if (inst->opcode != BRW_OPCODE_MOV ||
3074	  inst->predicated ||
3075	  inst->dst.file != GRF || inst->src[0].file != IMM ||
3076	  inst->dst.type != inst->src[0].type)
3077	 continue;
3078
3079      /* Don't bother with cases where we should have had the
3080       * operation on the constant folded in GLSL already.
3081       */
3082      if (inst->saturate)
3083	 continue;
3084
3085      /* Found a move of a constant to a GRF.  Find anything else using the GRF
3086       * before it's written, and replace it with the constant if we can.
3087       */
3088      exec_list_iterator scan_iter = iter;
3089      scan_iter.next();
3090      for (; scan_iter.has_next(); scan_iter.next()) {
3091	 fs_inst *scan_inst = (fs_inst *)scan_iter.get();
3092
3093	 if (scan_inst->opcode == BRW_OPCODE_DO ||
3094	     scan_inst->opcode == BRW_OPCODE_WHILE ||
3095	     scan_inst->opcode == BRW_OPCODE_ELSE ||
3096	     scan_inst->opcode == BRW_OPCODE_ENDIF) {
3097	    break;
3098	 }
3099
3100	 for (int i = 2; i >= 0; i--) {
3101	    if (scan_inst->src[i].file != GRF ||
3102		scan_inst->src[i].reg != inst->dst.reg ||
3103		scan_inst->src[i].reg_offset != inst->dst.reg_offset)
3104	       continue;
3105
3106	    /* Don't bother with cases where we should have had the
3107	     * operation on the constant folded in GLSL already.
3108	     */
3109	    if (scan_inst->src[i].negate || scan_inst->src[i].abs)
3110	       continue;
3111
3112	    switch (scan_inst->opcode) {
3113	    case BRW_OPCODE_MOV:
3114	       scan_inst->src[i] = inst->src[0];
3115	       progress = true;
3116	       break;
3117
3118	    case BRW_OPCODE_MUL:
3119	    case BRW_OPCODE_ADD:
3120	       if (i == 1) {
3121		  scan_inst->src[i] = inst->src[0];
3122		  progress = true;
3123	       } else if (i == 0 && scan_inst->src[1].file != IMM) {
3124		  /* Fit this constant in by commuting the operands */
3125		  scan_inst->src[0] = scan_inst->src[1];
3126		  scan_inst->src[1] = inst->src[0];
3127		  progress = true;
3128	       }
3129	       break;
3130
3131	    case BRW_OPCODE_CMP:
3132	       if (i == 1) {
3133		  scan_inst->src[i] = inst->src[0];
3134		  progress = true;
3135	       } else if (i == 0 && scan_inst->src[1].file != IMM) {
3136		  uint32_t new_cmod;
3137
3138		  new_cmod = brw_swap_cmod(scan_inst->conditional_mod);
3139		  if (new_cmod != ~0u) {
3140		     /* Fit this constant in by swapping the operands and
3141		      * flipping the test
3142		      */
3143		     scan_inst->src[0] = scan_inst->src[1];
3144		     scan_inst->src[1] = inst->src[0];
3145		     scan_inst->conditional_mod = new_cmod;
3146		     progress = true;
3147		  }
3148	       }
3149	       break;
3150
3151	    case BRW_OPCODE_SEL:
3152	       if (i == 1) {
3153		  scan_inst->src[i] = inst->src[0];
3154		  progress = true;
3155	       } else if (i == 0 && scan_inst->src[1].file != IMM) {
3156		  /* Fit this constant in by swapping the operands and
3157		   * flipping the predicate
3158		   */
3159		  scan_inst->src[0] = scan_inst->src[1];
3160		  scan_inst->src[1] = inst->src[0];
3161		  scan_inst->predicate_inverse = !scan_inst->predicate_inverse;
3162		  progress = true;
3163	       }
3164	       break;
3165	    }
3166	 }
3167
3168	 if (scan_inst->dst.file == GRF &&
3169	     scan_inst->dst.reg == inst->dst.reg &&
3170	     (scan_inst->dst.reg_offset == inst->dst.reg_offset ||
3171	      scan_inst->is_tex())) {
3172	    break;
3173	 }
3174      }
3175   }
3176
3177   if (progress)
3178       this->live_intervals_valid = false;
3179
3180   return progress;
3181}
3182/**
3183 * Must be called after calculate_live_intervals() to remove unused
3184 * writes to registers -- register allocation will fail otherwise
3185 * because something defined but not used won't be considered to
3186 * interfere with other regs.
3187 */
3188bool
3189fs_visitor::dead_code_eliminate()
3190{
3191   bool progress = false;
3192   int pc = 0;
3193
3194   calculate_live_intervals();
3195
3196   foreach_iter(exec_list_iterator, iter, this->instructions) {
3197      fs_inst *inst = (fs_inst *)iter.get();
3198
3199      if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
3200	 inst->remove();
3201	 progress = true;
3202      }
3203
3204      pc++;
3205   }
3206
3207   if (progress)
3208      live_intervals_valid = false;
3209
3210   return progress;
3211}
3212
3213bool
3214fs_visitor::register_coalesce()
3215{
3216   bool progress = false;
3217   int if_depth = 0;
3218   int loop_depth = 0;
3219
3220   foreach_iter(exec_list_iterator, iter, this->instructions) {
3221      fs_inst *inst = (fs_inst *)iter.get();
3222
3223      /* Make sure that we dominate the instructions we're going to
3224       * scan for interference with our coalescing; otherwise we won't
3225       * have scanned far enough to know whether anything interferes.
3226       * We don't dominate the following instructions if we're in a
3227       * loop or an if block.
3228       */
3229      switch (inst->opcode) {
3230      case BRW_OPCODE_DO:
3231	 loop_depth++;
3232	 break;
3233      case BRW_OPCODE_WHILE:
3234	 loop_depth--;
3235	 break;
3236      case BRW_OPCODE_IF:
3237	 if_depth++;
3238	 break;
3239      case BRW_OPCODE_ENDIF:
3240	 if_depth--;
3241	 break;
3242      }
3243      if (loop_depth || if_depth)
3244	 continue;
3245
3246      if (inst->opcode != BRW_OPCODE_MOV ||
3247	  inst->predicated ||
3248	  inst->saturate ||
3249	  inst->dst.file != GRF || inst->src[0].file != GRF ||
3250	  inst->dst.type != inst->src[0].type)
3251	 continue;
3252
3253      bool has_source_modifiers = inst->src[0].abs || inst->src[0].negate;
3254
3255      /* Found a move of a GRF to a GRF.  Let's see if we can coalesce
3256       * them: check for no writes to either one until the exit of the
3257       * program.
3258       */
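      /* For example (illustrative): given
       *
       *    mov vgrf8, vgrf3
       *    mul vgrf9, vgrf8, vgrf4
       *
       * if neither vgrf3 nor vgrf8 is written again, the MUL is
       * rewritten to read vgrf3 directly and the MOV is removed.
       */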
3259      bool interfered = false;
3260      exec_list_iterator scan_iter = iter;
3261      scan_iter.next();
3262      for (; scan_iter.has_next(); scan_iter.next()) {
3263	 fs_inst *scan_inst = (fs_inst *)scan_iter.get();
3264
3265	 if (scan_inst->dst.file == GRF) {
3266	    if (scan_inst->dst.reg == inst->dst.reg &&
3267		(scan_inst->dst.reg_offset == inst->dst.reg_offset ||
3268		 scan_inst->is_tex())) {
3269	       interfered = true;
3270	       break;
3271	    }
3272	    if (scan_inst->dst.reg == inst->src[0].reg &&
3273		(scan_inst->dst.reg_offset == inst->src[0].reg_offset ||
3274		 scan_inst->is_tex())) {
3275	       interfered = true;
3276	       break;
3277	    }
3278	 }
3279
3280	 /* The gen6 MATH instruction can't handle source modifiers, so avoid
3281	  * coalescing those for now.  We should do something more specific.
3282	  */
3283	 if (intel->gen == 6 && scan_inst->is_math() && has_source_modifiers) {
3284	    interfered = true;
3285	    break;
3286	 }
3287      }
3288      if (interfered) {
3289	 continue;
3290      }
3291
3292      /* Rewrite the later usage to point at the source of the move to
3293       * be removed.
3294       */
3295      for (exec_list_iterator scan_iter = iter; scan_iter.has_next();
3296	   scan_iter.next()) {
3297	 fs_inst *scan_inst = (fs_inst *)scan_iter.get();
3298
3299	 for (int i = 0; i < 3; i++) {
3300	    if (scan_inst->src[i].file == GRF &&
3301		scan_inst->src[i].reg == inst->dst.reg &&
3302		scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
3303	       scan_inst->src[i].reg = inst->src[0].reg;
3304	       scan_inst->src[i].reg_offset = inst->src[0].reg_offset;
3305	       scan_inst->src[i].abs |= inst->src[0].abs;
3306	       scan_inst->src[i].negate ^= inst->src[0].negate;
3307	       scan_inst->src[i].smear = inst->src[0].smear;
3308	    }
3309	 }
3310      }
3311
3312      inst->remove();
3313      progress = true;
3314   }
3315
3316   if (progress)
3317      live_intervals_valid = false;
3318
3319   return progress;
3320}
3321
3322
3323bool
3324fs_visitor::compute_to_mrf()
3325{
3326   bool progress = false;
3327   int next_ip = 0;
3328
3329   /* Need to update the MRF tracking for compressed instructions. */
3330   if (c->dispatch_width == 16)
3331      return false;
3332
3333   calculate_live_intervals();
3334
3335   foreach_iter(exec_list_iterator, iter, this->instructions) {
3336      fs_inst *inst = (fs_inst *)iter.get();
3337
3338      int ip = next_ip;
3339      next_ip++;
3340
3341      if (inst->opcode != BRW_OPCODE_MOV ||
3342	  inst->predicated ||
3343	  inst->dst.file != MRF || inst->src[0].file != GRF ||
3344	  inst->dst.type != inst->src[0].type ||
3345	  inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
3346	 continue;
3347
3348      /* Can't compute-to-MRF this GRF if someone else was going to
3349       * read it later.
3350       */
3351      if (this->virtual_grf_use[inst->src[0].reg] > ip)
3352	 continue;
3353
3354      /* Found a move of a GRF to a MRF.  Let's see if we can go
3355       * rewrite the thing that made this GRF to write into the MRF.
3356       */
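      /* Illustrative example: for
       *
       *    add vgrf7, vgrf1, vgrf2
       *    mov m3, vgrf7
       *
       * if vgrf7 has no later readers, the ADD is rewritten to write
       * m3 directly and the MOV goes away.
       */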
3357      fs_inst *scan_inst;
3358      for (scan_inst = (fs_inst *)inst->prev;
3359	   scan_inst->prev != NULL;
3360	   scan_inst = (fs_inst *)scan_inst->prev) {
3361	 if (scan_inst->dst.file == GRF &&
3362	     scan_inst->dst.reg == inst->src[0].reg) {
3363	    /* Found the last thing to write our reg we want to turn
3364	     * into a compute-to-MRF.
3365	     */
3366
3367	    if (scan_inst->is_tex()) {
3368	       /* texturing writes several contiguous regs, so we can't
3369		* compute-to-mrf that.
3370		*/
3371	       break;
3372	    }
3373
3374	    /* If it's predicated, it (probably) didn't populate all
3375	     * the channels.
3376	     */
3377	    if (scan_inst->predicated)
3378	       break;
3379
3380	    /* SEND instructions can't have MRF as a destination. */
3381	    if (scan_inst->mlen)
3382	       break;
3383
3384	    if (intel->gen >= 6) {
3385	       /* gen6 math instructions must have the destination be
3386		* GRF, so no compute-to-MRF for them.
3387		*/
3388	       if (scan_inst->is_math()) {
3389		  break;
3390	       }
3391	    }
3392
3393	    if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
3394	       /* Found the creator of our MRF's source value. */
3395	       scan_inst->dst.file = MRF;
3396	       scan_inst->dst.hw_reg = inst->dst.hw_reg;
3397	       scan_inst->saturate |= inst->saturate;
3398	       inst->remove();
3399	       progress = true;
3400	    }
3401	    break;
3402	 }
3403
3404	 /* We don't handle flow control here.  Most computation of
3405	  * values that end up in MRFs happens shortly before the MRF
3406	  * write anyway.
3407	  */
3408	 if (scan_inst->opcode == BRW_OPCODE_DO ||
3409	     scan_inst->opcode == BRW_OPCODE_WHILE ||
3410	     scan_inst->opcode == BRW_OPCODE_ELSE ||
3411	     scan_inst->opcode == BRW_OPCODE_ENDIF) {
3412	    break;
3413	 }
3414
3415	 /* You can't read from an MRF, so if someone else reads our
3416	  * MRF's source GRF that we wanted to rewrite, that stops us.
3417	  */
3418	 bool interfered = false;
3419	 for (int i = 0; i < 3; i++) {
3420	    if (scan_inst->src[i].file == GRF &&
3421		scan_inst->src[i].reg == inst->src[0].reg &&
3422		scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
3423	       interfered = true;
3424	    }
3425	 }
3426	 if (interfered)
3427	    break;
3428
3429	 if (scan_inst->dst.file == MRF &&
3430	     scan_inst->dst.hw_reg == inst->dst.hw_reg) {
3431	    /* Somebody else wrote our MRF here, so we can't
3432	     * compute-to-MRF before that.
3433	     */
3434	    break;
3435	 }
3436
3437	 if (scan_inst->mlen > 0) {
3438	    /* Found a SEND instruction, which means that there are
3439	     * live values in MRFs from base_mrf to base_mrf +
3440	     * scan_inst->mlen - 1.  Don't go pushing our MRF write up
3441	     * above it.
3442	     */
3443	    if (inst->dst.hw_reg >= scan_inst->base_mrf &&
3444		inst->dst.hw_reg < scan_inst->base_mrf + scan_inst->mlen) {
3445	       break;
3446	    }
3447	 }
3448      }
3449   }
3450
3451   return progress;
3452}
3453
3454/**
3455 * Walks through basic blocks, looking for repeated MRF writes and
3456 * removing the later ones.
3457 */
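/* For example (illustrative): in
 *
 *    mov m2, vgrf5
 *    mov m3, vgrf6
 *    mov m2, vgrf5
 *
 * the second write of m2 is identical to the first, with no
 * intervening write to m2 or vgrf5, so it is removed.
 */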
3458bool
3459fs_visitor::remove_duplicate_mrf_writes()
3460{
3461   fs_inst *last_mrf_move[16];
3462   bool progress = false;
3463
3464   /* Need to update the MRF tracking for compressed instructions. */
3465   if (c->dispatch_width == 16)
3466      return false;
3467
3468   memset(last_mrf_move, 0, sizeof(last_mrf_move));
3469
3470   foreach_iter(exec_list_iterator, iter, this->instructions) {
3471      fs_inst *inst = (fs_inst *)iter.get();
3472
3473      switch (inst->opcode) {
3474      case BRW_OPCODE_DO:
3475      case BRW_OPCODE_WHILE:
3476      case BRW_OPCODE_IF:
3477      case BRW_OPCODE_ELSE:
3478      case BRW_OPCODE_ENDIF:
3479	 memset(last_mrf_move, 0, sizeof(last_mrf_move));
3480	 continue;
3481      default:
3482	 break;
3483      }
3484
3485      if (inst->opcode == BRW_OPCODE_MOV &&
3486	  inst->dst.file == MRF) {
3487	 fs_inst *prev_inst = last_mrf_move[inst->dst.hw_reg];
3488	 if (prev_inst && inst->equals(prev_inst)) {
3489	    inst->remove();
3490	    progress = true;
3491	    continue;
3492	 }
3493      }
3494
3495      /* Clear out the last-write records for MRFs that were overwritten. */
3496      if (inst->dst.file == MRF) {
3497	 last_mrf_move[inst->dst.hw_reg] = NULL;
3498      }
3499
3500      if (inst->mlen > 0) {
3501	 /* Found a SEND instruction, which will include two or fewer
3502	  * implied MRF writes.  We could do better here.
3503	  */
3504	 for (int i = 0; i < implied_mrf_writes(inst); i++) {
3505	    last_mrf_move[inst->base_mrf + i] = NULL;
3506	 }
3507      }
3508
3509      /* Clear out any MRF move records whose sources got overwritten. */
3510      if (inst->dst.file == GRF) {
3511	 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
3512	    if (last_mrf_move[i] &&
3513		last_mrf_move[i]->src[0].reg == inst->dst.reg) {
3514	       last_mrf_move[i] = NULL;
3515	    }
3516	 }
3517      }
3518
3519      if (inst->opcode == BRW_OPCODE_MOV &&
3520	  inst->dst.file == MRF &&
3521	  inst->src[0].file == GRF &&
3522	  !inst->predicated) {
3523	 last_mrf_move[inst->dst.hw_reg] = inst;
3524      }
3525   }
3526
3527   return progress;
3528}
3529
3530bool
3531fs_visitor::virtual_grf_interferes(int a, int b)
3532{
3533   int start = MAX2(this->virtual_grf_def[a], this->virtual_grf_def[b]);
3534   int end = MIN2(this->virtual_grf_use[a], this->virtual_grf_use[b]);
3535
3536   /* We can't handle dead register writes here, without iterating
3537    * over the whole instruction stream to find every single dead
3538    * write to that register to compare to the live interval of the
3539    * other register.  Just assert that dead_code_eliminate() has been
3540    * called.
3541    */
3542   assert((this->virtual_grf_use[a] != -1 ||
3543	   this->virtual_grf_def[a] == MAX_INSTRUCTION) &&
3544	  (this->virtual_grf_use[b] != -1 ||
3545	   this->virtual_grf_def[b] == MAX_INSTRUCTION));
3546
3547   /* If the register is used to store 16 values of less than float
3548    * size (only the case for pixel_[xy]), then we can't allocate
3549    * another dword-sized thing to that register that would be used in
3550    * the same instruction.  This is because when the GPU decodes (for
3551    * example):
3552    *
3553    * (declare (in ) vec4 gl_FragCoord@0x97766a0)
3554    * add(16)         g6<1>F          g6<8,8,1>UW     0.5F { align1 compr };
3555    *
3556    * it's actually processed as:
3557    * add(8)         g6<1>F          g6<8,8,1>UW     0.5F { align1 };
3558    * add(8)         g7<1>F          g6.8<8,8,1>UW   0.5F { align1 sechalf };
3559    *
3560    * so our second half values in g6 got overwritten in the first
3561    * half.
3562    */
3563   if (c->dispatch_width == 16 && (this->pixel_x.reg == a ||
3564				   this->pixel_x.reg == b ||
3565				   this->pixel_y.reg == a ||
3566				   this->pixel_y.reg == b)) {
3567      return start <= end;
3568   }
3569
3570   return start < end;
3571}
3572
3573static struct brw_reg brw_reg_from_fs_reg(fs_reg *reg)
3574{
3575   struct brw_reg brw_reg;
3576
3577   switch (reg->file) {
3578   case GRF:
3579   case ARF:
3580   case MRF:
3581      if (reg->smear == -1) {
3582	 brw_reg = brw_vec8_reg(reg->file,
3583				reg->hw_reg, 0);
3584      } else {
3585	 brw_reg = brw_vec1_reg(reg->file,
3586				reg->hw_reg, reg->smear);
3587      }
3588      brw_reg = retype(brw_reg, reg->type);
3589      if (reg->sechalf)
3590	 brw_reg = sechalf(brw_reg);
3591      break;
3592   case IMM:
3593      switch (reg->type) {
3594      case BRW_REGISTER_TYPE_F:
3595	 brw_reg = brw_imm_f(reg->imm.f);
3596	 break;
3597      case BRW_REGISTER_TYPE_D:
3598	 brw_reg = brw_imm_d(reg->imm.i);
3599	 break;
3600      case BRW_REGISTER_TYPE_UD:
3601	 brw_reg = brw_imm_ud(reg->imm.u);
3602	 break;
3603      default:
3604	 assert(!"not reached");
3605	 brw_reg = brw_null_reg();
3606	 break;
3607      }
3608      break;
3609   case FIXED_HW_REG:
3610      brw_reg = reg->fixed_hw_reg;
3611      break;
3612   case BAD_FILE:
3613      /* Probably unused. */
3614      brw_reg = brw_null_reg();
3615      break;
3616   case UNIFORM:
3617      assert(!"not reached");
3618      brw_reg = brw_null_reg();
3619      break;
3620   default:
3621      assert(!"not reached");
3622      brw_reg = brw_null_reg();
3623      break;
3624   }
3625   if (reg->abs)
3626      brw_reg = brw_abs(brw_reg);
3627   if (reg->negate)
3628      brw_reg = negate(brw_reg);
3629
3630   return brw_reg;
3631}
3632
3633void
3634fs_visitor::generate_code()
3635{
3636   int last_native_inst = p->nr_insn;
3637   const char *last_annotation_string = NULL;
3638   ir_instruction *last_annotation_ir = NULL;
3639
   int if_stack_array_size = 16;
   int loop_stack_array_size = 16;
   int if_stack_depth = 0, loop_stack_depth = 0;
   brw_instruction **if_stack =
      rzalloc_array(this->mem_ctx, brw_instruction *, if_stack_array_size);
   brw_instruction **loop_stack =
      rzalloc_array(this->mem_ctx, brw_instruction *, loop_stack_array_size);
   int *if_depth_in_loop =
      rzalloc_array(this->mem_ctx, int, loop_stack_array_size);

   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
      printf("Native code for fragment shader %d (%d-wide dispatch):\n",
             ctx->Shader.CurrentFragmentProgram->Name, c->dispatch_width);
   }

   foreach_iter(exec_list_iterator, iter, this->instructions) {
      fs_inst *inst = (fs_inst *)iter.get();
      struct brw_reg src[3], dst;

      if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
         if (last_annotation_ir != inst->ir) {
            last_annotation_ir = inst->ir;
            if (last_annotation_ir) {
               printf("   ");
               last_annotation_ir->print();
               printf("\n");
            }
         }
         if (last_annotation_string != inst->annotation) {
            last_annotation_string = inst->annotation;
            if (last_annotation_string)
               printf("   %s\n", last_annotation_string);
         }
      }

      for (unsigned int i = 0; i < 3; i++) {
         src[i] = brw_reg_from_fs_reg(&inst->src[i]);
      }
      dst = brw_reg_from_fs_reg(&inst->dst);

      brw_set_conditionalmod(p, inst->conditional_mod);
      brw_set_predicate_control(p, inst->predicated);
      brw_set_predicate_inverse(p, inst->predicate_inverse);
      brw_set_saturate(p, inst->saturate);

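      /* Choose the compression mode for this instruction (see the decode
       * example in the interference test above): COMPRESSED runs all 16
       * channels, NONE only the first 8, and 2NDHALF an 8-channel
       * instruction operating on the second half's registers.
       */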
      if (inst->force_uncompressed || c->dispatch_width == 8) {
         brw_set_compression_control(p, BRW_COMPRESSION_NONE);
      } else if (inst->force_sechalf) {
         brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
      } else {
         brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
      }

      switch (inst->opcode) {
      case BRW_OPCODE_MOV:
         brw_MOV(p, dst, src[0]);
         break;
      case BRW_OPCODE_ADD:
         brw_ADD(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_MUL:
         brw_MUL(p, dst, src[0], src[1]);
         break;

      case BRW_OPCODE_FRC:
         brw_FRC(p, dst, src[0]);
         break;
      case BRW_OPCODE_RNDD:
         brw_RNDD(p, dst, src[0]);
         break;
      case BRW_OPCODE_RNDE:
         brw_RNDE(p, dst, src[0]);
         break;
      case BRW_OPCODE_RNDZ:
         brw_RNDZ(p, dst, src[0]);
         break;

      case BRW_OPCODE_AND:
         brw_AND(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_OR:
         brw_OR(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_XOR:
         brw_XOR(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_NOT:
         brw_NOT(p, dst, src[0]);
         break;
      case BRW_OPCODE_ASR:
         brw_ASR(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_SHR:
         brw_SHR(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_SHL:
         brw_SHL(p, dst, src[0], src[1]);
         break;

      case BRW_OPCODE_CMP:
         brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
         break;
      case BRW_OPCODE_SEL:
         brw_SEL(p, dst, src[0], src[1]);
         break;

      case BRW_OPCODE_IF:
         if (inst->src[0].file != BAD_FILE) {
            assert(intel->gen >= 6);
            if_stack[if_stack_depth] =
               gen6_IF(p, inst->conditional_mod, src[0], src[1]);
         } else {
            if_stack[if_stack_depth] = brw_IF(p, BRW_EXECUTE_8);
         }
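         /* One more IF is now open inside the innermost loop, so BREAK
          * and CONTINUE emitted later can pop the right number of levels.
          */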
         if_depth_in_loop[loop_stack_depth]++;
         if_stack_depth++;
         if (if_stack_array_size <= if_stack_depth) {
            if_stack_array_size *= 2;
            if_stack = reralloc(this->mem_ctx, if_stack, brw_instruction *,
                                if_stack_array_size);
         }
         break;

      case BRW_OPCODE_ELSE:
         if_stack[if_stack_depth - 1] =
            brw_ELSE(p, if_stack[if_stack_depth - 1]);
         break;
      case BRW_OPCODE_ENDIF:
         if_stack_depth--;
         brw_ENDIF(p, if_stack[if_stack_depth]);
         if_depth_in_loop[loop_stack_depth]--;
         break;

      case BRW_OPCODE_DO:
         loop_stack[loop_stack_depth++] = brw_DO(p, BRW_EXECUTE_8);
         if (loop_stack_array_size <= loop_stack_depth) {
            loop_stack_array_size *= 2;
            loop_stack = reralloc(this->mem_ctx, loop_stack, brw_instruction *,
                                  loop_stack_array_size);
            if_depth_in_loop = reralloc(this->mem_ctx, if_depth_in_loop, int,
                                        loop_stack_array_size);
         }
         if_depth_in_loop[loop_stack_depth] = 0;
         break;

      case BRW_OPCODE_BREAK:
         brw_BREAK(p, if_depth_in_loop[loop_stack_depth]);
         brw_set_predicate_control(p, BRW_PREDICATE_NONE);
         break;
      case BRW_OPCODE_CONTINUE:
         /* FINISHME: Loop instruction support still needs to be written. */
         if (intel->gen >= 6)
            gen6_CONT(p, loop_stack[loop_stack_depth - 1]);
         else
            brw_CONT(p, if_depth_in_loop[loop_stack_depth]);
         brw_set_predicate_control(p, BRW_PREDICATE_NONE);
         break;

      case BRW_OPCODE_WHILE: {
         struct brw_instruction *inst0, *inst1;
         GLuint br = 1;

         if (intel->gen >= 5)
            br = 2;

         assert(loop_stack_depth > 0);
         loop_stack_depth--;
         inst0 = inst1 = brw_WHILE(p, loop_stack[loop_stack_depth]);
         if (intel->gen < 6) {
            /* Patch all the BREAK/CONT instructions from the last BGNLOOP. */
            while (inst0 > loop_stack[loop_stack_depth]) {
               inst0--;
               if (inst0->header.opcode == BRW_OPCODE_BREAK &&
                   inst0->bits3.if_else.jump_count == 0) {
                  inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
               } else if (inst0->header.opcode == BRW_OPCODE_CONTINUE &&
                          inst0->bits3.if_else.jump_count == 0) {
                  inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
               }
            }
         }
         break;
      }
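      /* Worked example of the back-patching above (editor's illustration):
       * with br == 2 on gen5, a BREAK sitting two instructions before the
       * WHILE gets jump_count = 2 * (2 + 1) = 6, landing just past the
       * WHILE, while a CONTINUE at the same distance gets 2 * 2 = 4,
       * landing on the WHILE itself so the loop condition is re-evaluated.
       * The br factor reflects the jump units: gen5 counts jumps in 64-bit
       * halves of the 128-bit instructions.
       */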

      case FS_OPCODE_RCP:
      case FS_OPCODE_RSQ:
      case FS_OPCODE_SQRT:
      case FS_OPCODE_EXP2:
      case FS_OPCODE_LOG2:
      case FS_OPCODE_POW:
      case FS_OPCODE_SIN:
      case FS_OPCODE_COS:
         generate_math(inst, dst, src);
         break;
      case FS_OPCODE_PIXEL_X:
         generate_pixel_xy(dst, true);
         break;
      case FS_OPCODE_PIXEL_Y:
         generate_pixel_xy(dst, false);
         break;
      case FS_OPCODE_CINTERP:
         brw_MOV(p, dst, src[0]);
         break;
      case FS_OPCODE_LINTERP:
         generate_linterp(inst, dst, src);
         break;
      case FS_OPCODE_TEX:
      case FS_OPCODE_TXB:
      case FS_OPCODE_TXD:
      case FS_OPCODE_TXL:
         generate_tex(inst, dst, src[0]);
         break;
      case FS_OPCODE_DISCARD_NOT:
         generate_discard_not(inst, dst);
         break;
      case FS_OPCODE_DISCARD_AND:
         generate_discard_and(inst, src[0]);
         break;
      case FS_OPCODE_DDX:
         generate_ddx(inst, dst, src[0]);
         break;
      case FS_OPCODE_DDY:
         generate_ddy(inst, dst, src[0]);
         break;

      case FS_OPCODE_SPILL:
         generate_spill(inst, src[0]);
         break;

      case FS_OPCODE_UNSPILL:
         generate_unspill(inst, dst);
         break;

      case FS_OPCODE_PULL_CONSTANT_LOAD:
         generate_pull_constant_load(inst, dst);
         break;

      case FS_OPCODE_FB_WRITE:
         generate_fb_write(inst);
         break;
      default:
         if (inst->opcode < (int)ARRAY_SIZE(brw_opcodes)) {
            _mesa_problem(ctx, "Unsupported opcode `%s' in FS",
                          brw_opcodes[inst->opcode].name);
         } else {
            _mesa_problem(ctx, "Unsupported opcode %d in FS", inst->opcode);
         }
         fail("unsupported opcode in FS\n");
      }

      if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
         for (unsigned int i = last_native_inst; i < p->nr_insn; i++) {
            if (0) {
               printf("0x%08x 0x%08x 0x%08x 0x%08x ",
                      ((uint32_t *)&p->store[i])[3],
                      ((uint32_t *)&p->store[i])[2],
                      ((uint32_t *)&p->store[i])[1],
                      ((uint32_t *)&p->store[i])[0]);
            }
            brw_disasm(stdout, &p->store[i], intel->gen);
         }
      }

      last_native_inst = p->nr_insn;
   }

   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
      printf("\n");
   }

   ralloc_free(if_stack);
   ralloc_free(loop_stack);
   ralloc_free(if_depth_in_loop);

   brw_set_uip_jip(p);

   /* The INTEL_DEBUG=wm disassembly above is printed as instructions are
    * emitted, before brw_set_uip_jip() fills in the jump distances, which
    * are often exactly what needs debugging.  Flip this to if (1) to dump
    * the final, patched program instead.
    */
   if (0) {
      if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
         for (unsigned int i = 0; i < p->nr_insn; i++) {
            printf("0x%08x 0x%08x 0x%08x 0x%08x ",
                   ((uint32_t *)&p->store[i])[3],
                   ((uint32_t *)&p->store[i])[2],
                   ((uint32_t *)&p->store[i])[1],
                   ((uint32_t *)&p->store[i])[0]);
            brw_disasm(stdout, &p->store[i], intel->gen);
         }
      }
   }
}

bool
fs_visitor::run()
{
   uint32_t prog_offset_16 = 0;

   brw_wm_payload_setup(brw, c);

   if (c->dispatch_width == 16) {
      if (c->prog_data.curb_read_length) {
         /* Uniform support hasn't been hooked up for the 16-wide version
          * yet, so bail and let the 8-wide program handle it.
          */
         return false;
      }

      /* Align to a 64-byte boundary. */
      while ((c->func.nr_insn * sizeof(struct brw_instruction)) % 64) {
         brw_NOP(p);
      }
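      /* Since each brw_instruction is 16 bytes, the loop above pads with
       * NOPs until nr_insn is a multiple of 4, so the 16-wide program
       * emitted below starts on a 64-byte boundary.
       */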

      /* Save off the start of this 16-wide program in case we succeed. */
      prog_offset_16 = c->func.nr_insn * sizeof(struct brw_instruction);

      brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
   }

   if (0) {
      emit_dummy_fs();
   } else {
      calculate_urb_setup();
      if (intel->gen < 6)
         emit_interpolation_setup_gen4();
      else
         emit_interpolation_setup_gen6();

      /* Generate FS IR for main().  (The visitor only descends into
       * functions called "main".)
       */
      foreach_iter(exec_list_iterator, iter, *shader->ir) {
         ir_instruction *ir = (ir_instruction *)iter.get();
         base_ir = ir;
         ir->accept(this);
      }

      emit_fb_writes();

      split_virtual_grfs();

      setup_paramvalues_refs();
      setup_pull_constants();

      bool progress;
      do {
         progress = false;

         progress = remove_duplicate_mrf_writes() || progress;

         progress = propagate_constants() || progress;
         progress = register_coalesce() || progress;
         progress = compute_to_mrf() || progress;
         progress = dead_code_eliminate() || progress;
      } while (progress);
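      /* The do/while above iterates the passes to a fixed point because
       * each one can expose work for the others, e.g. propagate_constants()
       * can turn a MOV into a dead write that dead_code_eliminate() then
       * removes.
       */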

      schedule_instructions();

      assign_curb_setup();
      assign_urb_setup();

      if (0) {
         /* Debug of register spilling: go spill everything. */
         int virtual_grf_count = virtual_grf_next;
         for (int i = 1; i < virtual_grf_count; i++) {
            spill_reg(i);
         }
      }

      if (0)
         assign_regs_trivial();
      else {
         while (!assign_regs()) {
            if (failed)
               break;
         }
      }
   }
   assert(force_uncompressed_stack == 0);
   assert(force_sechalf_stack == 0);

   if (failed)
      return false;

   generate_code();

   if (c->dispatch_width == 8) {
      c->prog_data.total_grf = grf_used;
   } else {
      c->prog_data.total_grf_16 = grf_used;
      c->prog_data.prog_offset_16 = prog_offset_16;
   }

   return !failed;
}

bool
brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c)
{
   struct intel_context *intel = &brw->intel;
   struct gl_context *ctx = &intel->ctx;
   struct gl_shader_program *prog = ctx->Shader.CurrentFragmentProgram;

   if (!prog)
      return false;

   struct brw_shader *shader =
      (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
   if (!shader)
      return false;

   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
      printf("GLSL IR for native fragment shader %d:\n", prog->Name);
      _mesa_print_ir(shader->ir, NULL);
      printf("\n\n");
   }

   /* Now the main event: visit the shader IR and generate our FS IR for it.
    */
   c->dispatch_width = 8;

   fs_visitor v(c, shader);
   if (!v.run()) {
      /* FINISHME: Cleanly fail, test at link time, etc. */
      assert(!"not reached");
      return false;
   }

   if (intel->gen >= 5) {
      c->dispatch_width = 16;
      fs_visitor v2(c, shader);
      v2.run();
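      /* v2.run()'s return value is ignored: the 16-wide compile is
       * best-effort, and if it bails (as it does when curb_read_length is
       * set), the 16-wide prog_data fields are simply left unset and only
       * the mandatory 8-wide program is used.
       */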
4076
4077   c->prog_data.dispatch_width = 8;
4078
4079   return true;
4080}
4081
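
/* A minimal caller sketch (editor's illustration; the real call site lives
 * in the brw_wm compile path and may differ): a false return signals that
 * there is no linked GLSL fragment shader, or that compilation failed, and
 * the older non-GLSL fragment shader backend should be used instead.
 */
#if 0
if (!brw_wm_fs_emit(brw, c)) {
   /* hypothetical fallback entry point */
   brw_wm_emit_with_old_backend(brw, c);
}
#endif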