brw_fs.cpp revision 662f1b48bd1a02907bb42ecda889a3aa52a5755d
/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Eric Anholt <eric@anholt.net>
 *
 */

extern "C" {

#include <sys/types.h>

#include "main/macros.h"
#include "main/shaderobj.h"
#include "main/uniforms.h"
#include "program/prog_parameter.h"
#include "program/prog_print.h"
#include "program/prog_optimize.h"
#include "program/register_allocate.h"
#include "program/sampler.h"
#include "program/hash_table.h"
#include "brw_context.h"
#include "brw_eu.h"
#include "brw_wm.h"
}
#include "brw_fs.h"
#include "../glsl/glsl_types.h"
#include "../glsl/ir_optimization.h"
#include "../glsl/ir_print_visitor.h"

#define MAX_INSTRUCTION (1 << 30)
static struct brw_reg brw_reg_from_fs_reg(class fs_reg *reg);

struct gl_shader *
brw_new_shader(struct gl_context *ctx, GLuint name, GLuint type)
{
   struct brw_shader *shader;

   shader = rzalloc(NULL, struct brw_shader);
   if (shader) {
      shader->base.Type = type;
      shader->base.Name = name;
      _mesa_init_shader(ctx, &shader->base);
   }

   return &shader->base;
}

struct gl_shader_program *
brw_new_shader_program(struct gl_context *ctx, GLuint name)
{
   struct brw_shader_program *prog;
   prog = rzalloc(NULL, struct brw_shader_program);
   if (prog) {
      prog->base.Name = name;
      _mesa_init_shader_program(ctx, &prog->base);
   }
   return &prog->base;
}

GLboolean
brw_link_shader(struct gl_context *ctx, struct gl_shader_program *prog)
{
   struct brw_context *brw = brw_context(ctx);
   struct intel_context *intel = &brw->intel;

   struct brw_shader *shader =
      (struct brw_shader *)prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
   if (shader != NULL) {
      void *mem_ctx = ralloc_context(NULL);
      bool progress;

      if (shader->ir)
	 ralloc_free(shader->ir);
      shader->ir = new(shader) exec_list;
      clone_ir_list(mem_ctx, shader->ir, shader->base.ir);

      do_mat_op_to_vec(shader->ir);
      lower_instructions(shader->ir,
			 MOD_TO_FRACT |
			 DIV_TO_MUL_RCP |
			 SUB_TO_ADD_NEG |
			 EXP_TO_EXP2 |
			 LOG_TO_LOG2);

      /* Pre-gen6 HW can only nest if-statements 16 deep.  Beyond this,
       * if-statements need to be flattened.
       */
      if (intel->gen < 6)
	 lower_if_to_cond_assign(shader->ir, 16);

      do_lower_texture_projection(shader->ir);
      do_vec_index_to_cond_assign(shader->ir);
      brw_do_cubemap_normalize(shader->ir);
      lower_noise(shader->ir);
      lower_quadop_vector(shader->ir, false);
      lower_variable_index_to_cond_assign(shader->ir,
					  GL_TRUE, /* input */
					  GL_TRUE, /* output */
					  GL_TRUE, /* temp */
					  GL_TRUE /* uniform */
					  );

      do {
	 progress = false;

	 brw_do_channel_expressions(shader->ir);
	 brw_do_vector_splitting(shader->ir);

	 progress = do_lower_jumps(shader->ir, true, true,
				   true, /* main return */
				   false, /* continue */
				   false /* loops */
				   ) || progress;

	 progress = do_common_optimization(shader->ir, true, 32) || progress;
      } while (progress);

      validate_ir_tree(shader->ir);

      reparent_ir(shader->ir, shader->ir);
      ralloc_free(mem_ctx);
   }

   if (!_mesa_ir_link_shader(ctx, prog))
      return GL_FALSE;

   return GL_TRUE;
}

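/**
 * Returns the size of the given type, counted in scalar float components,
 * which is the unit used for virtual GRF allocation and fs_reg::reg_offset.
 */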
static int
type_size(const struct glsl_type *type)
{
   unsigned int size, i;

   switch (type->base_type) {
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_BOOL:
      return type->components();
   case GLSL_TYPE_ARRAY:
      return type_size(type->fields.array) * type->length;
   case GLSL_TYPE_STRUCT:
      size = 0;
      for (i = 0; i < type->length; i++) {
	 size += type_size(type->fields.structure[i].type);
      }
      return size;
   case GLSL_TYPE_SAMPLER:
      /* Samplers take up no register space, since they're baked in at
       * link time.
       */
      return 0;
   default:
      assert(!"not reached");
      return 0;
   }
}

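/**
 * Marks this compile as failed, printing the reason to stderr when
 * INTEL_DEBUG=wm is set.  Only the first failure is reported; callers
 * are expected to check this->failed rather than aborting on the spot.
 */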
void
fs_visitor::fail(const char *format, ...)
{
   if (!failed) {
      failed = true;

      if (INTEL_DEBUG & DEBUG_WM) {
	 fprintf(stderr, "FS compile failed: ");

	 va_list va;
	 va_start(va, format);
	 vfprintf(stderr, format, va);
	 va_end(va);
      }
   }
}

void
fs_visitor::push_force_uncompressed()
{
   force_uncompressed_stack++;
}

void
fs_visitor::pop_force_uncompressed()
{
   force_uncompressed_stack--;
   assert(force_uncompressed_stack >= 0);
}

void
fs_visitor::push_force_sechalf()
{
   force_sechalf_stack++;
}

void
fs_visitor::pop_force_sechalf()
{
   force_sechalf_stack--;
   assert(force_sechalf_stack >= 0);
}

/**
 * Returns how many MRFs an FS opcode will write over.
 *
 * Note that this is not the 0 or 1 implied writes in an actual gen
 * instruction -- the FS opcodes often generate MOVs in addition.
 */
int
fs_visitor::implied_mrf_writes(fs_inst *inst)
{
   if (inst->mlen == 0)
      return 0;

   switch (inst->opcode) {
   case FS_OPCODE_RCP:
   case FS_OPCODE_RSQ:
   case FS_OPCODE_SQRT:
   case FS_OPCODE_EXP2:
   case FS_OPCODE_LOG2:
   case FS_OPCODE_SIN:
   case FS_OPCODE_COS:
      return 1 * c->dispatch_width / 8;
   case FS_OPCODE_POW:
      return 2 * c->dispatch_width / 8;
   case FS_OPCODE_TEX:
   case FS_OPCODE_TXB:
   case FS_OPCODE_TXD:
   case FS_OPCODE_TXL:
      return 1;
   case FS_OPCODE_FB_WRITE:
      return 2;
   case FS_OPCODE_PULL_CONSTANT_LOAD:
   case FS_OPCODE_UNSPILL:
      return 1;
   case FS_OPCODE_SPILL:
      return 2;
   default:
      assert(!"not reached");
      return inst->mlen;
   }
}

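/**
 * Allocates a new virtual GRF of the given size (in components) and
 * returns its register number, growing the size-tracking array
 * geometrically as needed.
 */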
int
fs_visitor::virtual_grf_alloc(int size)
{
   if (virtual_grf_array_size <= virtual_grf_next) {
      if (virtual_grf_array_size == 0)
	 virtual_grf_array_size = 16;
      else
	 virtual_grf_array_size *= 2;
      virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
				   virtual_grf_array_size);

      /* This slot is always unused. */
      virtual_grf_sizes[0] = 0;
   }
   virtual_grf_sizes[virtual_grf_next] = size;
   return virtual_grf_next++;
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int hw_reg)
{
   init();
   this->file = file;
   this->hw_reg = hw_reg;
   this->type = BRW_REGISTER_TYPE_F;
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int hw_reg, uint32_t type)
{
   init();
   this->file = file;
   this->hw_reg = hw_reg;
   this->type = type;
}

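/**
 * Returns the BRW hardware register type used to back values of the
 * given GLSL base type.
 */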
int
brw_type_for_base_type(const struct glsl_type *type)
{
   switch (type->base_type) {
   case GLSL_TYPE_FLOAT:
      return BRW_REGISTER_TYPE_F;
   case GLSL_TYPE_INT:
   case GLSL_TYPE_BOOL:
      return BRW_REGISTER_TYPE_D;
   case GLSL_TYPE_UINT:
      return BRW_REGISTER_TYPE_UD;
   case GLSL_TYPE_ARRAY:
   case GLSL_TYPE_STRUCT:
   case GLSL_TYPE_SAMPLER:
      /* These should be overridden with the type of the member when
       * dereferenced into.  BRW_REGISTER_TYPE_UD is used here so that
       * we're likely to trip up quickly if we fail to do so.
       */
      return BRW_REGISTER_TYPE_UD;
   default:
      assert(!"not reached");
      return BRW_REGISTER_TYPE_F;
   }
}

/** Automatic reg constructor. */
fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
{
   init();

   this->file = GRF;
   this->reg = v->virtual_grf_alloc(type_size(type));
   this->reg_offset = 0;
   this->type = brw_type_for_base_type(type);
}

fs_reg *
fs_visitor::variable_storage(ir_variable *var)
{
   return (fs_reg *)hash_table_find(this->variable_ht, var);
}

/* Our support for uniforms is piggy-backed on the struct
 * gl_fragment_program, because that's where the values actually
 * get stored, rather than in some global gl_shader_program uniform
 * store.
 */
int
fs_visitor::setup_uniform_values(int loc, const glsl_type *type)
{
   unsigned int offset = 0;

   if (type->is_matrix()) {
      const glsl_type *column = glsl_type::get_instance(GLSL_TYPE_FLOAT,
							type->vector_elements,
							1);

      for (unsigned int i = 0; i < type->matrix_columns; i++) {
	 offset += setup_uniform_values(loc + offset, column);
      }

      return offset;
   }

   switch (type->base_type) {
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_BOOL:
      for (unsigned int i = 0; i < type->vector_elements; i++) {
	 unsigned int param = c->prog_data.nr_params++;

	 assert(param < ARRAY_SIZE(c->prog_data.param));

	 switch (type->base_type) {
	 case GLSL_TYPE_FLOAT:
	    c->prog_data.param_convert[param] = PARAM_NO_CONVERT;
	    break;
	 case GLSL_TYPE_UINT:
	    c->prog_data.param_convert[param] = PARAM_CONVERT_F2U;
	    break;
	 case GLSL_TYPE_INT:
	    c->prog_data.param_convert[param] = PARAM_CONVERT_F2I;
	    break;
	 case GLSL_TYPE_BOOL:
	    c->prog_data.param_convert[param] = PARAM_CONVERT_F2B;
	    break;
	 default:
	    assert(!"not reached");
	    c->prog_data.param_convert[param] = PARAM_NO_CONVERT;
	    break;
	 }
	 this->param_index[param] = loc;
	 this->param_offset[param] = i;
      }
      return 1;

   case GLSL_TYPE_STRUCT:
      for (unsigned int i = 0; i < type->length; i++) {
	 offset += setup_uniform_values(loc + offset,
					type->fields.structure[i].type);
      }
      return offset;

   case GLSL_TYPE_ARRAY:
      for (unsigned int i = 0; i < type->length; i++) {
	 offset += setup_uniform_values(loc + offset, type->fields.array);
      }
      return offset;

   case GLSL_TYPE_SAMPLER:
      /* The sampler takes up a slot, but we don't use any values from it. */
      return 1;

   default:
      assert(!"not reached");
      return 0;
   }
}


/* Our support for builtin uniforms is even scarier than non-builtin.
 * It sits on top of the PROG_STATE_VAR parameters that are
 * automatically updated from GL context state.
 */
void
fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
{
   const ir_state_slot *const slots = ir->state_slots;
   assert(ir->state_slots != NULL);

   for (unsigned int i = 0; i < ir->num_state_slots; i++) {
      /* This state reference has already been set up by ir_to_mesa, but
       * we'll get the same index back here.
       */
      int index = _mesa_add_state_reference(this->fp->Base.Parameters,
					    (gl_state_index *)slots[i].tokens);

      /* Add each of the unique swizzles of the element as a parameter.
       * This'll end up matching the expected layout of the
       * array/matrix/structure we're trying to fill in.
       */
      int last_swiz = -1;
      for (unsigned int j = 0; j < 4; j++) {
	 int swiz = GET_SWZ(slots[i].swizzle, j);
	 if (swiz == last_swiz)
	    break;
	 last_swiz = swiz;

	 c->prog_data.param_convert[c->prog_data.nr_params] =
	    PARAM_NO_CONVERT;
	 this->param_index[c->prog_data.nr_params] = index;
	 this->param_offset[c->prog_data.nr_params] = swiz;
	 c->prog_data.nr_params++;
      }
   }
}

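/**
 * Sets up gl_FragCoord: X/Y come from the payload pixel centers (offset
 * by 0.5 unless pixel-center-integer is set, and flipped when rendering
 * to an FBO), Z is the interpolated or payload depth, and W is copied
 * from the wpos_w value that interpolation setup always computes.
 */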
fs_reg *
fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   fs_reg wpos = *reg;
   fs_reg neg_y = this->pixel_y;
   neg_y.negate = true;
   bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;

   /* gl_FragCoord.x */
   if (ir->pixel_center_integer) {
      emit(BRW_OPCODE_MOV, wpos, this->pixel_x);
   } else {
      emit(BRW_OPCODE_ADD, wpos, this->pixel_x, fs_reg(0.5f));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.y */
   if (!flip && ir->pixel_center_integer) {
      emit(BRW_OPCODE_MOV, wpos, this->pixel_y);
   } else {
      fs_reg pixel_y = this->pixel_y;
      float offset = (ir->pixel_center_integer ? 0.0 : 0.5);

      if (flip) {
	 pixel_y.negate = true;
	 offset += c->key.drawable_height - 1.0;
      }

      emit(BRW_OPCODE_ADD, wpos, pixel_y, fs_reg(offset));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.z */
   if (intel->gen >= 6) {
      emit(BRW_OPCODE_MOV, wpos,
	   fs_reg(brw_vec8_grf(c->source_depth_reg, 0)));
   } else {
      emit(FS_OPCODE_LINTERP, wpos, this->delta_x, this->delta_y,
	   interp_reg(FRAG_ATTRIB_WPOS, 2));
   }
   wpos.reg_offset++;
   /* gl_FragCoord.w: Already set up by emit_interpolation_setup_*(). */
   emit(BRW_OPCODE_MOV, wpos, this->wpos_w);

   return reg;
}

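/**
 * Emits interpolation for an ordinary varying input, walking arrays and
 * matrix columns one URB slot at a time.  gl_Color inputs use constant
 * interpolation under flat shading; everything else is linearly
 * interpolated, with a further multiply by pixel_w on pre-gen6 (skipped
 * for gl_Color when the linear_color key is set).
 */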
fs_reg *
fs_visitor::emit_general_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   /* Interpolation is always in floating point regs. */
   reg->type = BRW_REGISTER_TYPE_F;
   fs_reg attr = *reg;

   unsigned int array_elements;
   const glsl_type *type;

   if (ir->type->is_array()) {
      array_elements = ir->type->length;
      if (array_elements == 0) {
	 fail("dereferenced array '%s' has length 0\n", ir->name);
      }
      type = ir->type->fields.array;
   } else {
      array_elements = 1;
      type = ir->type;
   }

   int location = ir->location;
   for (unsigned int i = 0; i < array_elements; i++) {
      for (unsigned int j = 0; j < type->matrix_columns; j++) {
	 if (urb_setup[location] == -1) {
	    /* If there's no incoming setup data for this slot, don't
	     * emit interpolation for it.
	     */
	    attr.reg_offset += type->vector_elements;
	    location++;
	    continue;
	 }

	 bool is_gl_Color =
	    location == FRAG_ATTRIB_COL0 || location == FRAG_ATTRIB_COL1;

	 if (c->key.flat_shade && is_gl_Color) {
	    /* Constant interpolation (flat shading) case. The SF has
	     * handed us defined values in only the constant offset
	     * field of the setup reg.
	     */
	    for (unsigned int k = 0; k < type->vector_elements; k++) {
	       struct brw_reg interp = interp_reg(location, k);
	       interp = suboffset(interp, 3);
	       emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
	       attr.reg_offset++;
	    }
	 } else {
	    /* Perspective interpolation case. */
	    for (unsigned int k = 0; k < type->vector_elements; k++) {
	       struct brw_reg interp = interp_reg(location, k);
	       emit(FS_OPCODE_LINTERP, attr,
		    this->delta_x, this->delta_y, fs_reg(interp));
	       attr.reg_offset++;
	    }

	    if (intel->gen < 6 && !(is_gl_Color && c->key.linear_color)) {
	       attr.reg_offset -= type->vector_elements;
	       for (unsigned int k = 0; k < type->vector_elements; k++) {
		  emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
		  attr.reg_offset++;
	       }
	    }
	 }
	 location++;
      }
   }

   return reg;
}

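/**
 * Computes gl_FrontFacing from the thread payload: on gen6+ it is
 * derived from bit 15 of g0.0 (inverted by the NOT below), and on
 * earlier parts from the "primitive is back face" bit (bit 31) of g1.6.
 */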
fs_reg *
fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);

   /* The frontfacing comes in as a bit in the thread payload. */
   if (intel->gen >= 6) {
      emit(BRW_OPCODE_ASR, *reg,
	   fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
	   fs_reg(15));
      emit(BRW_OPCODE_NOT, *reg, *reg);
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
   } else {
      struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
      /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
       * us front face
       */
      fs_inst *inst = emit(BRW_OPCODE_CMP, *reg,
			   fs_reg(r1_6ud),
			   fs_reg(1u << 31));
      inst->conditional_mod = BRW_CONDITIONAL_L;
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
   }

   return reg;
}

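/**
 * Emits a single-source math instruction, handling per-generation
 * quirks: pre-gen6 math is message-based through MRFs, while gen6+ math
 * can't take uniforms or source modifiers directly, so such operands are
 * first expanded into a temporary.
 */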
fs_inst *
fs_visitor::emit_math(fs_opcodes opcode, fs_reg dst, fs_reg src)
{
   switch (opcode) {
   case FS_OPCODE_RCP:
   case FS_OPCODE_RSQ:
   case FS_OPCODE_SQRT:
   case FS_OPCODE_EXP2:
   case FS_OPCODE_LOG2:
   case FS_OPCODE_SIN:
   case FS_OPCODE_COS:
      break;
   default:
      assert(!"not reached: bad math opcode");
      return NULL;
   }

   /* Can't do hstride == 0 args to gen6 math, so expand it out.  We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * The hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (intel->gen >= 6 && (src.file == UNIFORM ||
			   src.abs ||
			   src.negate)) {
      fs_reg expanded = fs_reg(this, glsl_type::float_type);
      emit(BRW_OPCODE_MOV, expanded, src);
      src = expanded;
   }

   fs_inst *inst = emit(opcode, dst, src);

   if (intel->gen < 6) {
      inst->base_mrf = 2;
      inst->mlen = c->dispatch_width / 8;
   }

   return inst;
}

fs_inst *
fs_visitor::emit_math(fs_opcodes opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   int base_mrf = 2;
   fs_inst *inst;

   assert(opcode == FS_OPCODE_POW);

   if (intel->gen >= 6) {
      /* Can't do hstride == 0 args to gen6 math, so expand it out.
       *
       * The hardware ignores source modifiers (negate and abs) on math
       * instructions, so we also move to a temp to set those up.
       */
      if (src0.file == UNIFORM || src0.abs || src0.negate) {
	 fs_reg expanded = fs_reg(this, glsl_type::float_type);
	 emit(BRW_OPCODE_MOV, expanded, src0);
	 src0 = expanded;
      }

      if (src1.file == UNIFORM || src1.abs || src1.negate) {
	 fs_reg expanded = fs_reg(this, glsl_type::float_type);
	 emit(BRW_OPCODE_MOV, expanded, src1);
	 src1 = expanded;
      }

      inst = emit(opcode, dst, src0, src1);
   } else {
      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1), src1);
      inst = emit(opcode, dst, src0, reg_null_f);

      inst->base_mrf = base_mrf;
      inst->mlen = 2 * c->dispatch_width / 8;
   }
   return inst;
}

void
fs_visitor::visit(ir_variable *ir)
{
   fs_reg *reg = NULL;

   if (variable_storage(ir))
      return;

   if (strcmp(ir->name, "gl_FragColor") == 0) {
      this->frag_color = ir;
   } else if (strcmp(ir->name, "gl_FragData") == 0) {
      this->frag_data = ir;
   } else if (strcmp(ir->name, "gl_FragDepth") == 0) {
      this->frag_depth = ir;
   }

   if (ir->mode == ir_var_in) {
      if (!strcmp(ir->name, "gl_FragCoord")) {
	 reg = emit_fragcoord_interpolation(ir);
      } else if (!strcmp(ir->name, "gl_FrontFacing")) {
	 reg = emit_frontfacing_interpolation(ir);
      } else {
	 reg = emit_general_interpolation(ir);
      }
      assert(reg);
      hash_table_insert(this->variable_ht, reg, ir);
      return;
   }

   if (ir->mode == ir_var_uniform) {
      int param_index = c->prog_data.nr_params;

      if (!strncmp(ir->name, "gl_", 3)) {
	 setup_builtin_uniform_values(ir);
      } else {
	 setup_uniform_values(ir->location, ir->type);
      }

      reg = new(this->mem_ctx) fs_reg(UNIFORM, param_index);
      reg->type = brw_type_for_base_type(ir->type);
   }

   if (!reg)
      reg = new(this->mem_ctx) fs_reg(this, ir->type);

   hash_table_insert(this->variable_ht, reg, ir);
}

void
fs_visitor::visit(ir_dereference_variable *ir)
{
   fs_reg *reg = variable_storage(ir->var);
   this->result = *reg;
}

void
fs_visitor::visit(ir_dereference_record *ir)
{
   const glsl_type *struct_type = ir->record->type;

   ir->record->accept(this);

   unsigned int offset = 0;
   for (unsigned int i = 0; i < struct_type->length; i++) {
      if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
	 break;
      offset += type_size(struct_type->fields.structure[i].type);
   }
   this->result.reg_offset += offset;
   this->result.type = brw_type_for_base_type(ir->type);
}

void
fs_visitor::visit(ir_dereference_array *ir)
{
   ir_constant *index;
   int element_size;

   ir->array->accept(this);
   index = ir->array_index->as_constant();

   element_size = type_size(ir->type);
   this->result.type = brw_type_for_base_type(ir->type);

   if (index) {
      assert(this->result.file == UNIFORM ||
	     (this->result.file == GRF &&
	      this->result.reg != 0));
      this->result.reg_offset += index->value.i[0] * element_size;
   } else {
      assert(!"FINISHME: non-constant array element");
   }
}

/* Instruction selection: Produce a MOV.sat instead of
 * MIN(MAX(val, 0), 1) when possible.
 */
bool
fs_visitor::try_emit_saturate(ir_expression *ir)
{
   ir_rvalue *sat_val = ir->as_rvalue_to_saturate();

   if (!sat_val)
      return false;

   sat_val->accept(this);
   fs_reg src = this->result;

   this->result = fs_reg(this, ir->type);
   fs_inst *inst = emit(BRW_OPCODE_MOV, this->result, src);
   inst->saturate = true;

   return true;
}

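/**
 * Maps an IR comparison operator onto the BRW conditional-modifier
 * encoding used with CMP and IF instructions.
 */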
static uint32_t
brw_conditional_for_comparison(unsigned int op)
{
   switch (op) {
   case ir_binop_less:
      return BRW_CONDITIONAL_L;
   case ir_binop_greater:
      return BRW_CONDITIONAL_G;
   case ir_binop_lequal:
      return BRW_CONDITIONAL_LE;
   case ir_binop_gequal:
      return BRW_CONDITIONAL_GE;
   case ir_binop_equal:
   case ir_binop_all_equal: /* same as equal for scalars */
      return BRW_CONDITIONAL_Z;
   case ir_binop_nequal:
   case ir_binop_any_nequal: /* same as nequal for scalars */
      return BRW_CONDITIONAL_NZ;
   default:
      assert(!"not reached: bad operation for comparison");
      return BRW_CONDITIONAL_NZ;
   }
}

void
fs_visitor::visit(ir_expression *ir)
{
   unsigned int operand;
   fs_reg op[2], temp;
   fs_inst *inst;

   assert(ir->get_num_operands() <= 2);

   if (try_emit_saturate(ir))
      return;

   for (operand = 0; operand < ir->get_num_operands(); operand++) {
      ir->operands[operand]->accept(this);
      if (this->result.file == BAD_FILE) {
	 ir_print_visitor v;
	 fail("Failed to get tree for expression operand:\n");
	 ir->operands[operand]->accept(&v);
      }
      op[operand] = this->result;

      /* Matrix expression operands should have been broken down to vector
       * operations already.
       */
      assert(!ir->operands[operand]->type->is_matrix());
      /* And then those vector operands should have been broken down to scalar.
       */
      assert(!ir->operands[operand]->type->is_vector());
   }

   /* Storage for our result.  If our result goes into an assignment, it will
    * just get copy-propagated out, so no worries.
    */
   this->result = fs_reg(this, ir->type);

   switch (ir->operation) {
   case ir_unop_logic_not:
      /* Note that BRW_OPCODE_NOT is not appropriate here, since it is the
       * one's complement of the whole register, not just bit 0.
       */
      emit(BRW_OPCODE_XOR, this->result, op[0], fs_reg(1));
      break;
   case ir_unop_neg:
      op[0].negate = !op[0].negate;
      this->result = op[0];
      break;
   case ir_unop_abs:
      op[0].abs = true;
      op[0].negate = false;
      this->result = op[0];
      break;
   case ir_unop_sign:
      temp = fs_reg(this, ir->type);

      emit(BRW_OPCODE_MOV, this->result, fs_reg(0.0f));

      inst = emit(BRW_OPCODE_CMP, reg_null_f, op[0], fs_reg(0.0f));
      inst->conditional_mod = BRW_CONDITIONAL_G;
      inst = emit(BRW_OPCODE_MOV, this->result, fs_reg(1.0f));
      inst->predicated = true;

      inst = emit(BRW_OPCODE_CMP, reg_null_f, op[0], fs_reg(0.0f));
      inst->conditional_mod = BRW_CONDITIONAL_L;
      inst = emit(BRW_OPCODE_MOV, this->result, fs_reg(-1.0f));
      inst->predicated = true;

      break;
   case ir_unop_rcp:
      emit_math(FS_OPCODE_RCP, this->result, op[0]);
      break;

   case ir_unop_exp2:
      emit_math(FS_OPCODE_EXP2, this->result, op[0]);
      break;
   case ir_unop_log2:
      emit_math(FS_OPCODE_LOG2, this->result, op[0]);
      break;
   case ir_unop_exp:
   case ir_unop_log:
      assert(!"not reached: should be handled by ir_explog_to_explog2");
      break;
   case ir_unop_sin:
   case ir_unop_sin_reduced:
      emit_math(FS_OPCODE_SIN, this->result, op[0]);
      break;
   case ir_unop_cos:
   case ir_unop_cos_reduced:
      emit_math(FS_OPCODE_COS, this->result, op[0]);
      break;

   case ir_unop_dFdx:
      emit(FS_OPCODE_DDX, this->result, op[0]);
      break;
   case ir_unop_dFdy:
      emit(FS_OPCODE_DDY, this->result, op[0]);
      break;

   case ir_binop_add:
      emit(BRW_OPCODE_ADD, this->result, op[0], op[1]);
      break;
   case ir_binop_sub:
      assert(!"not reached: should be handled by ir_sub_to_add_neg");
      break;

   case ir_binop_mul:
      emit(BRW_OPCODE_MUL, this->result, op[0], op[1]);
      break;
   case ir_binop_div:
      assert(!"not reached: should be handled by ir_div_to_mul_rcp");
      break;
   case ir_binop_mod:
      assert(!"ir_binop_mod should have been converted to b * fract(a/b)");
      break;

   case ir_binop_less:
   case ir_binop_greater:
   case ir_binop_lequal:
   case ir_binop_gequal:
   case ir_binop_equal:
   case ir_binop_all_equal:
   case ir_binop_nequal:
   case ir_binop_any_nequal:
      temp = this->result;
      /* original gen4 does implicit conversion before comparison. */
      if (intel->gen < 5)
	 temp.type = op[0].type;

      inst = emit(BRW_OPCODE_CMP, temp, op[0], op[1]);
      inst->conditional_mod = brw_conditional_for_comparison(ir->operation);
      emit(BRW_OPCODE_AND, this->result, this->result, fs_reg(0x1));
      break;

   case ir_binop_logic_xor:
      emit(BRW_OPCODE_XOR, this->result, op[0], op[1]);
      break;

   case ir_binop_logic_or:
      emit(BRW_OPCODE_OR, this->result, op[0], op[1]);
      break;

   case ir_binop_logic_and:
      emit(BRW_OPCODE_AND, this->result, op[0], op[1]);
      break;

   case ir_binop_dot:
   case ir_unop_any:
      assert(!"not reached: should be handled by brw_fs_channel_expressions");
      break;

   case ir_unop_noise:
      assert(!"not reached: should be handled by lower_noise");
      break;

   case ir_quadop_vector:
      assert(!"not reached: should be handled by lower_quadop_vector");
      break;

   case ir_unop_sqrt:
      emit_math(FS_OPCODE_SQRT, this->result, op[0]);
      break;

   case ir_unop_rsq:
      emit_math(FS_OPCODE_RSQ, this->result, op[0]);
      break;

   case ir_unop_i2f:
   case ir_unop_b2f:
   case ir_unop_b2i:
   case ir_unop_f2i:
      emit(BRW_OPCODE_MOV, this->result, op[0]);
      break;
   case ir_unop_f2b:
   case ir_unop_i2b:
      temp = this->result;
      /* original gen4 does implicit conversion before comparison. */
      if (intel->gen < 5)
	 temp.type = op[0].type;

      inst = emit(BRW_OPCODE_CMP, temp, op[0], fs_reg(0.0f));
      inst->conditional_mod = BRW_CONDITIONAL_NZ;
      inst = emit(BRW_OPCODE_AND, this->result, this->result, fs_reg(1));
      break;

   case ir_unop_trunc:
      emit(BRW_OPCODE_RNDZ, this->result, op[0]);
      break;
   case ir_unop_ceil:
      op[0].negate = !op[0].negate;
      inst = emit(BRW_OPCODE_RNDD, this->result, op[0]);
      this->result.negate = true;
      break;
   case ir_unop_floor:
      inst = emit(BRW_OPCODE_RNDD, this->result, op[0]);
      break;
   case ir_unop_fract:
      inst = emit(BRW_OPCODE_FRC, this->result, op[0]);
      break;
   case ir_unop_round_even:
      emit(BRW_OPCODE_RNDE, this->result, op[0]);
      break;

   case ir_binop_min:
      inst = emit(BRW_OPCODE_CMP, this->result, op[0], op[1]);
      inst->conditional_mod = BRW_CONDITIONAL_L;

      inst = emit(BRW_OPCODE_SEL, this->result, op[0], op[1]);
      inst->predicated = true;
      break;
   case ir_binop_max:
      inst = emit(BRW_OPCODE_CMP, this->result, op[0], op[1]);
      inst->conditional_mod = BRW_CONDITIONAL_G;

      inst = emit(BRW_OPCODE_SEL, this->result, op[0], op[1]);
      inst->predicated = true;
      break;

   case ir_binop_pow:
      emit_math(FS_OPCODE_POW, this->result, op[0], op[1]);
      break;

   case ir_unop_bit_not:
      inst = emit(BRW_OPCODE_NOT, this->result, op[0]);
      break;
   case ir_binop_bit_and:
      inst = emit(BRW_OPCODE_AND, this->result, op[0], op[1]);
      break;
   case ir_binop_bit_xor:
      inst = emit(BRW_OPCODE_XOR, this->result, op[0], op[1]);
      break;
   case ir_binop_bit_or:
      inst = emit(BRW_OPCODE_OR, this->result, op[0], op[1]);
      break;

   case ir_unop_u2f:
   case ir_binop_lshift:
   case ir_binop_rshift:
      assert(!"GLSL 1.30 features unsupported");
      break;
   }
}

void
fs_visitor::emit_assignment_writes(fs_reg &l, fs_reg &r,
				   const glsl_type *type, bool predicated)
{
   switch (type->base_type) {
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_BOOL:
      for (unsigned int i = 0; i < type->components(); i++) {
	 l.type = brw_type_for_base_type(type);
	 r.type = brw_type_for_base_type(type);

	 fs_inst *inst = emit(BRW_OPCODE_MOV, l, r);
	 inst->predicated = predicated;

	 l.reg_offset++;
	 r.reg_offset++;
      }
      break;
   case GLSL_TYPE_ARRAY:
      for (unsigned int i = 0; i < type->length; i++) {
	 emit_assignment_writes(l, r, type->fields.array, predicated);
      }
      break;

   case GLSL_TYPE_STRUCT:
      for (unsigned int i = 0; i < type->length; i++) {
	 emit_assignment_writes(l, r, type->fields.structure[i].type,
				predicated);
      }
      break;

   case GLSL_TYPE_SAMPLER:
      break;

   default:
      assert(!"not reached");
      break;
   }
}

void
fs_visitor::visit(ir_assignment *ir)
{
   struct fs_reg l, r;
   fs_inst *inst;

   /* FINISHME: arrays on the lhs */
   ir->lhs->accept(this);
   l = this->result;

   ir->rhs->accept(this);
   r = this->result;

   assert(l.file != BAD_FILE);
   assert(r.file != BAD_FILE);

   if (ir->condition) {
      emit_bool_to_cond_code(ir->condition);
   }

   if (ir->lhs->type->is_scalar() ||
       ir->lhs->type->is_vector()) {
      for (int i = 0; i < ir->lhs->type->vector_elements; i++) {
	 if (ir->write_mask & (1 << i)) {
	    inst = emit(BRW_OPCODE_MOV, l, r);
	    if (ir->condition)
	       inst->predicated = true;
	    r.reg_offset++;
	 }
	 l.reg_offset++;
      }
   } else {
      emit_assignment_writes(l, r, ir->lhs->type, ir->condition != NULL);
   }
}

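/**
 * Assembles the gen4 sampler message payload in MRFs.  Plain texturing
 * and shadow compares use the SIMD8 path; gen4 has no SIMD8
 * non-shadow-compare bias/LOD messages, so those go out SIMD16 and the
 * interleaved result is moved back into SIMD8 layout afterward.
 */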
fs_inst *
fs_visitor::emit_texture_gen4(ir_texture *ir, fs_reg dst, fs_reg coordinate)
{
   int mlen;
   int base_mrf = 1;
   bool simd16 = false;
   fs_reg orig_dst;

   /* g0 header. */
   mlen = 1;

   if (ir->shadow_comparitor) {
      for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
	 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i), coordinate);
	 coordinate.reg_offset++;
      }
      /* gen4's SIMD8 sampler always has the slots for u,v,r present. */
      mlen += 3;

      if (ir->op == ir_tex) {
	 /* There's no plain shadow compare message, so we use shadow
	  * compare with a bias of 0.0.
	  */
	 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), fs_reg(0.0f));
	 mlen++;
      } else if (ir->op == ir_txb) {
	 ir->lod_info.bias->accept(this);
	 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
	 mlen++;
      } else {
	 assert(ir->op == ir_txl);
	 ir->lod_info.lod->accept(this);
	 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
	 mlen++;
      }

      ir->shadow_comparitor->accept(this);
      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
      mlen++;
   } else if (ir->op == ir_tex) {
      for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
	 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i), coordinate);
	 coordinate.reg_offset++;
      }
      /* gen4's SIMD8 sampler always has the slots for u,v,r present. */
      mlen += 3;
   } else if (ir->op == ir_txd) {
      assert(!"TXD isn't supported on gen4 yet.");
   } else {
      /* Oh joy.  gen4 doesn't have SIMD8 non-shadow-compare bias/lod
       * instructions.  We'll need to do SIMD16 here.
       */
      assert(ir->op == ir_txb || ir->op == ir_txl);

      for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
	 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i * 2), coordinate);
	 coordinate.reg_offset++;
      }

      /* lod/bias appears after u/v/r. */
      mlen += 6;

      if (ir->op == ir_txb) {
	 ir->lod_info.bias->accept(this);
	 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
	 mlen++;
      } else {
	 ir->lod_info.lod->accept(this);
	 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
	 mlen++;
      }

      /* The unused upper half. */
      mlen++;

      /* Now, since we're doing simd16, the return is 2 interleaved
       * vec4s where the odd-indexed ones are junk. We'll need to move
       * this weirdness around to the expected layout.
       */
      simd16 = true;
      orig_dst = dst;
      dst = fs_reg(this, glsl_type::get_array_instance(glsl_type::vec4_type,
						       2));
      dst.type = BRW_REGISTER_TYPE_F;
   }

   fs_inst *inst = NULL;
   switch (ir->op) {
   case ir_tex:
      inst = emit(FS_OPCODE_TEX, dst);
      break;
   case ir_txb:
      inst = emit(FS_OPCODE_TXB, dst);
      break;
   case ir_txl:
      inst = emit(FS_OPCODE_TXL, dst);
      break;
   case ir_txd:
      inst = emit(FS_OPCODE_TXD, dst);
      break;
   case ir_txf:
      assert(!"GLSL 1.30 features unsupported");
      break;
   }
   inst->base_mrf = base_mrf;
   inst->mlen = mlen;

   if (simd16) {
      for (int i = 0; i < 4; i++) {
	 emit(BRW_OPCODE_MOV, orig_dst, dst);
	 orig_dst.reg_offset++;
	 dst.reg_offset += 2;
      }
   }

   return inst;
}

/* gen5's sampler has slots for u, v, r, array index, then optional
 * parameters like shadow comparitor or LOD bias.  If optional
 * parameters aren't present, those base slots are optional and don't
 * need to be included in the message.
 *
 * We don't fill in the unnecessary slots regardless, which may look
 * surprising in the disassembly.
 */
fs_inst *
fs_visitor::emit_texture_gen5(ir_texture *ir, fs_reg dst, fs_reg coordinate)
{
   int mlen = 1; /* g0 header always present. */
   int base_mrf = 1;
   int reg_width = c->dispatch_width / 8;

   for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i * reg_width),
	   coordinate);
      coordinate.reg_offset++;
   }
   mlen += ir->coordinate->type->vector_elements * reg_width;

   if (ir->shadow_comparitor) {
      mlen = MAX2(mlen, 1 + 4 * reg_width);

      ir->shadow_comparitor->accept(this);
      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
      mlen += reg_width;
   }

   fs_inst *inst = NULL;
   switch (ir->op) {
   case ir_tex:
      inst = emit(FS_OPCODE_TEX, dst);
      break;
   case ir_txb:
      ir->lod_info.bias->accept(this);
      mlen = MAX2(mlen, 1 + 4 * reg_width);
      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
      mlen += reg_width;

      inst = emit(FS_OPCODE_TXB, dst);

      break;
   case ir_txl:
      ir->lod_info.lod->accept(this);
      mlen = MAX2(mlen, 1 + 4 * reg_width);
      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
      mlen += reg_width;

      inst = emit(FS_OPCODE_TXL, dst);
      break;
   case ir_txd:
   case ir_txf:
      assert(!"GLSL 1.30 features unsupported");
      break;
   }
   inst->base_mrf = base_mrf;
   inst->mlen = mlen;

   if (mlen > 11) {
      fail("Message length >11 disallowed by hardware\n");
   }

   return inst;
}

void
fs_visitor::visit(ir_texture *ir)
{
   int sampler;
   fs_inst *inst = NULL;

   ir->coordinate->accept(this);
   fs_reg coordinate = this->result;

   if (ir->offset != NULL) {
      ir_constant *offset = ir->offset->as_constant();
      assert(offset != NULL);

      signed char offsets[3];
      for (unsigned i = 0; i < ir->offset->type->vector_elements; i++)
	 offsets[i] = (signed char) offset->value.i[i];

      /* Combine all three offsets into a single unsigned dword:
       *
       *    bits 11:8 - U Offset (X component)
       *    bits  7:4 - V Offset (Y component)
       *    bits  3:0 - R Offset (Z component)
       */
      unsigned offset_bits = 0;
      for (unsigned i = 0; i < ir->offset->type->vector_elements; i++) {
	 const unsigned shift = 4 * (2 - i);
	 offset_bits |= (offsets[i] << shift) & (0xF << shift);
      }

      /* Explicitly set up the message header by copying g0 to msg reg m1. */
      emit(BRW_OPCODE_MOV, fs_reg(MRF, 1, BRW_REGISTER_TYPE_UD),
	   fs_reg(GRF, 0, BRW_REGISTER_TYPE_UD));

      /* Then set the offset bits in DWord 2 of the message header. */
      emit(BRW_OPCODE_MOV,
	   fs_reg(retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, 1, 2),
			 BRW_REGISTER_TYPE_UD)),
	   fs_reg(brw_imm_uw(offset_bits)));
   }

   /* Should be lowered by do_lower_texture_projection */
   assert(!ir->projector);

   sampler = _mesa_get_sampler_uniform_value(ir->sampler,
					     ctx->Shader.CurrentFragmentProgram,
					     &brw->fragment_program->Base);
   sampler = c->fp->program.Base.SamplerUnits[sampler];

   /* The 965 requires the EU to do the normalization of GL rectangle
    * texture coordinates.  We use the program parameter state
    * tracking to get the scaling factor.
    */
   if (ir->sampler->type->sampler_dimensionality == GLSL_SAMPLER_DIM_RECT) {
      struct gl_program_parameter_list *params = c->fp->program.Base.Parameters;
      int tokens[STATE_LENGTH] = {
	 STATE_INTERNAL,
	 STATE_TEXRECT_SCALE,
	 sampler,
	 0,
	 0
      };

      c->prog_data.param_convert[c->prog_data.nr_params] =
	 PARAM_NO_CONVERT;
      c->prog_data.param_convert[c->prog_data.nr_params + 1] =
	 PARAM_NO_CONVERT;

      fs_reg scale_x = fs_reg(UNIFORM, c->prog_data.nr_params);
      fs_reg scale_y = fs_reg(UNIFORM, c->prog_data.nr_params + 1);
      GLuint index = _mesa_add_state_reference(params,
					       (gl_state_index *)tokens);

      this->param_index[c->prog_data.nr_params] = index;
      this->param_offset[c->prog_data.nr_params] = 0;
      c->prog_data.nr_params++;
      this->param_index[c->prog_data.nr_params] = index;
      this->param_offset[c->prog_data.nr_params] = 1;
      c->prog_data.nr_params++;

      fs_reg dst = fs_reg(this, ir->coordinate->type);
      fs_reg src = coordinate;
      coordinate = dst;

      emit(BRW_OPCODE_MUL, dst, src, scale_x);
      dst.reg_offset++;
      src.reg_offset++;
      emit(BRW_OPCODE_MUL, dst, src, scale_y);
   }

   /* Writemasking doesn't eliminate channels on SIMD8 texture
    * samples, so don't worry about them.
    */
   fs_reg dst = fs_reg(this, glsl_type::vec4_type);

   if (intel->gen < 5) {
      inst = emit_texture_gen4(ir, dst, coordinate);
   } else {
      inst = emit_texture_gen5(ir, dst, coordinate);
   }

   /* If there's an offset, we already set up m1.  To avoid the implied move,
    * use the null register.  Otherwise, we want an implied move from g0.
    */
   if (ir->offset != NULL)
      inst->src[0] = fs_reg(brw_null_reg());
   else
      inst->src[0] = fs_reg(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW));

   inst->sampler = sampler;

   this->result = dst;

   if (ir->shadow_comparitor)
      inst->shadow_compare = true;

   if (ir->type == glsl_type::float_type) {
      /* Ignore DEPTH_TEXTURE_MODE swizzling. */
      assert(ir->sampler->type->sampler_shadow);
   } else if (c->key.tex_swizzles[inst->sampler] != SWIZZLE_NOOP) {
      fs_reg swizzle_dst = fs_reg(this, glsl_type::vec4_type);

      for (int i = 0; i < 4; i++) {
	 int swiz = GET_SWZ(c->key.tex_swizzles[inst->sampler], i);
	 fs_reg l = swizzle_dst;
	 l.reg_offset += i;

	 if (swiz == SWIZZLE_ZERO) {
	    emit(BRW_OPCODE_MOV, l, fs_reg(0.0f));
	 } else if (swiz == SWIZZLE_ONE) {
	    emit(BRW_OPCODE_MOV, l, fs_reg(1.0f));
	 } else {
	    fs_reg r = dst;
	    r.reg_offset += GET_SWZ(c->key.tex_swizzles[inst->sampler], i);
	    emit(BRW_OPCODE_MOV, l, r);
	 }
      }
      this->result = swizzle_dst;
   }
}

void
fs_visitor::visit(ir_swizzle *ir)
{
   ir->val->accept(this);
   fs_reg val = this->result;

   if (ir->type->vector_elements == 1) {
      this->result.reg_offset += ir->mask.x;
      return;
   }

   fs_reg result = fs_reg(this, ir->type);
   this->result = result;

   for (unsigned int i = 0; i < ir->type->vector_elements; i++) {
      fs_reg channel = val;
      int swiz = 0;

      switch (i) {
      case 0:
	 swiz = ir->mask.x;
	 break;
      case 1:
	 swiz = ir->mask.y;
	 break;
      case 2:
	 swiz = ir->mask.z;
	 break;
      case 3:
	 swiz = ir->mask.w;
	 break;
      }

      channel.reg_offset += swiz;
      emit(BRW_OPCODE_MOV, result, channel);
      result.reg_offset++;
   }
}

void
fs_visitor::visit(ir_discard *ir)
{
   fs_reg temp = fs_reg(this, glsl_type::uint_type);

   assert(ir->condition == NULL); /* FINISHME */

   emit(FS_OPCODE_DISCARD_NOT, temp, reg_null_d);
   emit(FS_OPCODE_DISCARD_AND, reg_null_d, temp);
   kill_emitted = true;
}

void
fs_visitor::visit(ir_constant *ir)
{
   /* Set this->result to reg at the bottom of the function because some code
    * paths will cause this visitor to be applied to other fields.  This will
    * cause the value stored in this->result to be modified.
    *
    * Make reg constant so that it doesn't get accidentally modified along the
    * way.  Yes, I actually had this problem. :(
    */
   const fs_reg reg(this, ir->type);
   fs_reg dst_reg = reg;

   if (ir->type->is_array()) {
      const unsigned size = type_size(ir->type->fields.array);

      for (unsigned i = 0; i < ir->type->length; i++) {
	 ir->array_elements[i]->accept(this);
	 fs_reg src_reg = this->result;

	 dst_reg.type = src_reg.type;
	 for (unsigned j = 0; j < size; j++) {
	    emit(BRW_OPCODE_MOV, dst_reg, src_reg);
	    src_reg.reg_offset++;
	    dst_reg.reg_offset++;
	 }
      }
   } else if (ir->type->is_record()) {
      foreach_list(node, &ir->components) {
	 ir_instruction *const field = (ir_instruction *) node;
	 const unsigned size = type_size(field->type);

	 field->accept(this);
	 fs_reg src_reg = this->result;

	 dst_reg.type = src_reg.type;
	 for (unsigned j = 0; j < size; j++) {
	    emit(BRW_OPCODE_MOV, dst_reg, src_reg);
	    src_reg.reg_offset++;
	    dst_reg.reg_offset++;
	 }
      }
   } else {
      const unsigned size = type_size(ir->type);

      for (unsigned i = 0; i < size; i++) {
	 switch (ir->type->base_type) {
	 case GLSL_TYPE_FLOAT:
	    emit(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.f[i]));
	    break;
	 case GLSL_TYPE_UINT:
	    emit(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.u[i]));
	    break;
	 case GLSL_TYPE_INT:
	    emit(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.i[i]));
	    break;
	 case GLSL_TYPE_BOOL:
	    emit(BRW_OPCODE_MOV, dst_reg, fs_reg((int)ir->value.b[i]));
	    break;
	 default:
	    assert(!"Non-float/uint/int/bool constant");
	 }
	 dst_reg.reg_offset++;
      }
   }

   this->result = reg;
}

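/**
 * Evaluates a boolean rvalue purely for its effect on the flag register,
 * so the following instruction can be predicated on the result:
 * comparisons fold directly into a CMP, and other expressions reduce to
 * a test against zero.
 */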
void
fs_visitor::emit_bool_to_cond_code(ir_rvalue *ir)
{
   ir_expression *expr = ir->as_expression();

   if (expr) {
      fs_reg op[2];
      fs_inst *inst;

      assert(expr->get_num_operands() <= 2);
      for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
	 assert(expr->operands[i]->type->is_scalar());

	 expr->operands[i]->accept(this);
	 op[i] = this->result;
      }

      switch (expr->operation) {
      case ir_unop_logic_not:
	 inst = emit(BRW_OPCODE_AND, reg_null_d, op[0], fs_reg(1));
	 inst->conditional_mod = BRW_CONDITIONAL_Z;
	 break;

      case ir_binop_logic_xor:
	 inst = emit(BRW_OPCODE_XOR, reg_null_d, op[0], op[1]);
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 break;

      case ir_binop_logic_or:
	 inst = emit(BRW_OPCODE_OR, reg_null_d, op[0], op[1]);
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 break;

      case ir_binop_logic_and:
	 inst = emit(BRW_OPCODE_AND, reg_null_d, op[0], op[1]);
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 break;

      case ir_unop_f2b:
	 if (intel->gen >= 6) {
	    inst = emit(BRW_OPCODE_CMP, reg_null_d, op[0], fs_reg(0.0f));
	 } else {
	    inst = emit(BRW_OPCODE_MOV, reg_null_f, op[0]);
	 }
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 break;

      case ir_unop_i2b:
	 if (intel->gen >= 6) {
	    inst = emit(BRW_OPCODE_CMP, reg_null_d, op[0], fs_reg(0));
	 } else {
	    inst = emit(BRW_OPCODE_MOV, reg_null_d, op[0]);
	 }
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 break;

      case ir_binop_greater:
      case ir_binop_gequal:
      case ir_binop_less:
      case ir_binop_lequal:
      case ir_binop_equal:
      case ir_binop_all_equal:
      case ir_binop_nequal:
      case ir_binop_any_nequal:
	 inst = emit(BRW_OPCODE_CMP, reg_null_cmp, op[0], op[1]);
	 inst->conditional_mod =
	    brw_conditional_for_comparison(expr->operation);
	 break;

      default:
	 assert(!"not reached");
	 fail("bad cond code\n");
	 break;
      }
      return;
   }

   ir->accept(this);

   if (intel->gen >= 6) {
      fs_inst *inst = emit(BRW_OPCODE_AND, reg_null_d, this->result, fs_reg(1));
      inst->conditional_mod = BRW_CONDITIONAL_NZ;
   } else {
      fs_inst *inst = emit(BRW_OPCODE_MOV, reg_null_d, this->result);
      inst->conditional_mod = BRW_CONDITIONAL_NZ;
   }
}

/**
 * Emit a gen6 IF statement with the comparison folded into the IF
 * instruction.
 */
void
fs_visitor::emit_if_gen6(ir_if *ir)
{
   ir_expression *expr = ir->condition->as_expression();

   if (expr) {
      fs_reg op[2];
      fs_inst *inst;
      fs_reg temp;

      assert(expr->get_num_operands() <= 2);
      for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
	 assert(expr->operands[i]->type->is_scalar());

	 expr->operands[i]->accept(this);
	 op[i] = this->result;
      }

      switch (expr->operation) {
      case ir_unop_logic_not:
	 inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], fs_reg(0));
	 inst->conditional_mod = BRW_CONDITIONAL_Z;
	 return;

      case ir_binop_logic_xor:
	 inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], op[1]);
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 return;

      case ir_binop_logic_or:
	 temp = fs_reg(this, glsl_type::bool_type);
	 emit(BRW_OPCODE_OR, temp, op[0], op[1]);
	 inst = emit(BRW_OPCODE_IF, reg_null_d, temp, fs_reg(0));
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 return;

      case ir_binop_logic_and:
	 temp = fs_reg(this, glsl_type::bool_type);
	 emit(BRW_OPCODE_AND, temp, op[0], op[1]);
	 inst = emit(BRW_OPCODE_IF, reg_null_d, temp, fs_reg(0));
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 return;

      case ir_unop_f2b:
	 inst = emit(BRW_OPCODE_IF, reg_null_f, op[0], fs_reg(0));
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 return;

      case ir_unop_i2b:
	 inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], fs_reg(0));
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 return;

      case ir_binop_greater:
      case ir_binop_gequal:
      case ir_binop_less:
      case ir_binop_lequal:
      case ir_binop_equal:
      case ir_binop_all_equal:
      case ir_binop_nequal:
      case ir_binop_any_nequal:
	 inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], op[1]);
	 inst->conditional_mod =
	    brw_conditional_for_comparison(expr->operation);
	 return;
      default:
	 assert(!"not reached");
	 inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], fs_reg(0));
	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
	 fail("bad condition\n");
	 return;
      }
      return;
   }

   ir->condition->accept(this);

   fs_inst *inst = emit(BRW_OPCODE_IF, reg_null_d, this->result, fs_reg(0));
   inst->conditional_mod = BRW_CONDITIONAL_NZ;
}

void
fs_visitor::visit(ir_if *ir)
{
   fs_inst *inst;

   if (c->dispatch_width == 16) {
      fail("Can't support (non-uniform) control flow on 16-wide\n");
   }

   /* Don't point the annotation at the if statement, because then it plus
    * the then and else blocks get printed.
    */
   this->base_ir = ir->condition;

   if (intel->gen >= 6) {
      emit_if_gen6(ir);
   } else {
      emit_bool_to_cond_code(ir->condition);

      inst = emit(BRW_OPCODE_IF);
      inst->predicated = true;
   }

   foreach_iter(exec_list_iterator, iter, ir->then_instructions) {
      ir_instruction *ir = (ir_instruction *)iter.get();
      this->base_ir = ir;

      ir->accept(this);
   }

   if (!ir->else_instructions.is_empty()) {
      emit(BRW_OPCODE_ELSE);

      foreach_iter(exec_list_iterator, iter, ir->else_instructions) {
	 ir_instruction *ir = (ir_instruction *)iter.get();
	 this->base_ir = ir;

	 ir->accept(this);
      }
   }

   emit(BRW_OPCODE_ENDIF);
}

void
fs_visitor::visit(ir_loop *ir)
{
   fs_reg counter = reg_undef;

   if (c->dispatch_width == 16) {
      fail("Can't support (non-uniform) control flow on 16-wide\n");
   }

   if (ir->counter) {
      this->base_ir = ir->counter;
      ir->counter->accept(this);
      counter = *(variable_storage(ir->counter));

      if (ir->from) {
	 this->base_ir = ir->from;
	 ir->from->accept(this);

	 emit(BRW_OPCODE_MOV, counter, this->result);
      }
   }

   emit(BRW_OPCODE_DO);

   if (ir->to) {
      this->base_ir = ir->to;
      ir->to->accept(this);

      fs_inst *inst = emit(BRW_OPCODE_CMP, reg_null_cmp, counter, this->result);
      inst->conditional_mod = brw_conditional_for_comparison(ir->cmp);

      inst = emit(BRW_OPCODE_BREAK);
      inst->predicated = true;
   }

   foreach_iter(exec_list_iterator, iter, ir->body_instructions) {
      ir_instruction *ir = (ir_instruction *)iter.get();

      this->base_ir = ir;
      ir->accept(this);
   }

   if (ir->increment) {
      this->base_ir = ir->increment;
      ir->increment->accept(this);
      emit(BRW_OPCODE_ADD, counter, counter, this->result);
   }

   emit(BRW_OPCODE_WHILE);
}

void
fs_visitor::visit(ir_loop_jump *ir)
{
   switch (ir->mode) {
   case ir_loop_jump::jump_break:
      emit(BRW_OPCODE_BREAK);
      break;
   case ir_loop_jump::jump_continue:
      emit(BRW_OPCODE_CONTINUE);
      break;
   }
}

void
fs_visitor::visit(ir_call *ir)
{
   assert(!"FINISHME");
}

void
fs_visitor::visit(ir_return *ir)
{
   assert(!"FINISHME");
}

void
fs_visitor::visit(ir_function *ir)
{
   /* Ignore function bodies other than main() -- we shouldn't see calls to
    * them since they should all be inlined before we get to ir_to_mesa.
    */
   if (strcmp(ir->name, "main") == 0) {
      const ir_function_signature *sig;
      exec_list empty;

      sig = ir->matching_signature(&empty);

      assert(sig);

      foreach_iter(exec_list_iterator, iter, sig->body) {
	 ir_instruction *ir = (ir_instruction *)iter.get();
	 this->base_ir = ir;

	 ir->accept(this);
      }
   }
}

void
fs_visitor::visit(ir_function_signature *ir)
{
   assert(!"not reached");
   (void)ir;
}

1912fs_inst *
1913fs_visitor::emit(fs_inst inst)
1914{
1915   fs_inst *list_inst = new(mem_ctx) fs_inst;
1916   *list_inst = inst;
1917
1918   if (force_uncompressed_stack > 0)
1919      list_inst->force_uncompressed = true;
1920   else if (force_sechalf_stack > 0)
1921      list_inst->force_sechalf = true;
1922
1923   list_inst->annotation = this->current_annotation;
1924   list_inst->ir = this->base_ir;
1925
1926   this->instructions.push_tail(list_inst);
1927
1928   return list_inst;
1929}
1930
1931/** Emits a dummy fragment shader consisting of magenta for bringup purposes. */
1932void
1933fs_visitor::emit_dummy_fs()
1934{
1935   /* Everyone's favorite color. */
1936   emit(BRW_OPCODE_MOV, fs_reg(MRF, 2), fs_reg(1.0f));
1937   emit(BRW_OPCODE_MOV, fs_reg(MRF, 3), fs_reg(0.0f));
1938   emit(BRW_OPCODE_MOV, fs_reg(MRF, 4), fs_reg(1.0f));
1939   emit(BRW_OPCODE_MOV, fs_reg(MRF, 5), fs_reg(0.0f));
1940
1941   fs_inst *write;
1942   write = emit(FS_OPCODE_FB_WRITE, fs_reg(0), fs_reg(0));
1943   write->base_mrf = 0;
1944}
1945
1946/* The register location here is relative to the start of the URB
1947 * data.  It will get adjusted to be a real location before
1948 * generate_code() time.
1949 */
1950struct brw_reg
1951fs_visitor::interp_reg(int location, int channel)
1952{
1953   int regnr = urb_setup[location] * 2 + channel / 2;
1954   int stride = (channel & 1) * 4;
1955
1956   assert(urb_setup[location] != -1);
1957
1958   return brw_vec1_grf(regnr, stride);
1959}
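
/* A worked example of the math above (the urb_setup value is hypothetical):
 * if urb_setup[location] == 3, the four channels map to
 *
 *    channel 0 -> brw_vec1_grf(6, 0)     channel 1 -> brw_vec1_grf(6, 4)
 *    channel 2 -> brw_vec1_grf(7, 0)     channel 3 -> brw_vec1_grf(7, 4)
 *
 * i.e. two registers of setup data per attribute, two channels per
 * register, with the odd channel half a register (4 floats) in.
 */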
1960
1961/** Emits the interpolation for the varying inputs. */
1962void
1963fs_visitor::emit_interpolation_setup_gen4()
1964{
1965   this->current_annotation = "compute pixel centers";
1966   this->pixel_x = fs_reg(this, glsl_type::uint_type);
1967   this->pixel_y = fs_reg(this, glsl_type::uint_type);
1968   this->pixel_x.type = BRW_REGISTER_TYPE_UW;
1969   this->pixel_y.type = BRW_REGISTER_TYPE_UW;
1970
1971   emit(FS_OPCODE_PIXEL_X, this->pixel_x);
1972   emit(FS_OPCODE_PIXEL_Y, this->pixel_y);
1973
1974   this->current_annotation = "compute pixel deltas from v0";
1975   if (brw->has_pln) {
1976      this->delta_x = fs_reg(this, glsl_type::vec2_type);
1977      this->delta_y = this->delta_x;
1978      this->delta_y.reg_offset++;
1979   } else {
1980      this->delta_x = fs_reg(this, glsl_type::float_type);
1981      this->delta_y = fs_reg(this, glsl_type::float_type);
1982   }
1983   emit(BRW_OPCODE_ADD, this->delta_x,
1984	this->pixel_x, fs_reg(negate(brw_vec1_grf(1, 0))));
1985   emit(BRW_OPCODE_ADD, this->delta_y,
1986	this->pixel_y, fs_reg(negate(brw_vec1_grf(1, 1))));
1987
1988   this->current_annotation = "compute pos.w and 1/pos.w";
1989   /* Compute wpos.w.  It's always in our setup, since it's needed to
1990    * interpolate the other attributes.
1991    */
1992   this->wpos_w = fs_reg(this, glsl_type::float_type);
1993   emit(FS_OPCODE_LINTERP, wpos_w, this->delta_x, this->delta_y,
1994	interp_reg(FRAG_ATTRIB_WPOS, 3));
1995   /* Compute the pixel 1/W value from wpos.w. */
1996   this->pixel_w = fs_reg(this, glsl_type::float_type);
1997   emit_math(FS_OPCODE_RCP, this->pixel_w, wpos_w);
1998   this->current_annotation = NULL;
1999}
2000
2001/** Emits the interpolation for the varying inputs. */
2002void
2003fs_visitor::emit_interpolation_setup_gen6()
2004{
2005   struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
2006
2007   /* If the pixel centers end up used, the setup is the same as for gen4. */
2008   this->current_annotation = "compute pixel centers";
2009   fs_reg int_pixel_x = fs_reg(this, glsl_type::uint_type);
2010   fs_reg int_pixel_y = fs_reg(this, glsl_type::uint_type);
2011   int_pixel_x.type = BRW_REGISTER_TYPE_UW;
2012   int_pixel_y.type = BRW_REGISTER_TYPE_UW;
2013   emit(BRW_OPCODE_ADD,
2014	int_pixel_x,
2015	fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
2016	fs_reg(brw_imm_v(0x10101010)));
2017   emit(BRW_OPCODE_ADD,
2018	int_pixel_y,
2019	fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
2020	fs_reg(brw_imm_v(0x11001100)));
2021
2022   /* As of gen6, we can no longer mix float and int sources.  We have
2023    * to turn the integer pixel centers into floats for their actual
2024    * use.
2025    */
2026   this->pixel_x = fs_reg(this, glsl_type::float_type);
2027   this->pixel_y = fs_reg(this, glsl_type::float_type);
2028   emit(BRW_OPCODE_MOV, this->pixel_x, int_pixel_x);
2029   emit(BRW_OPCODE_MOV, this->pixel_y, int_pixel_y);
2030
2031   this->current_annotation = "compute pos.w";
2032   this->pixel_w = fs_reg(brw_vec8_grf(c->source_w_reg, 0));
2033   this->wpos_w = fs_reg(this, glsl_type::float_type);
2034   emit_math(FS_OPCODE_RCP, this->wpos_w, this->pixel_w);
2035
2036   this->delta_x = fs_reg(brw_vec8_grf(2, 0));
2037   this->delta_y = fs_reg(brw_vec8_grf(3, 0));
2038
2039   this->current_annotation = NULL;
2040}
2041
2042void
2043fs_visitor::emit_fb_writes()
2044{
2045   this->current_annotation = "FB write header";
2046   GLboolean header_present = GL_TRUE;
2047   int nr = 0;
2048   int reg_width = c->dispatch_width / 8;
2049
2050   if (intel->gen >= 6 &&
2051       !this->kill_emitted &&
2052       c->key.nr_color_regions == 1) {
2053      header_present = false;
2054   }
2055
2056   if (header_present) {
2057      /* m0, m1 header */
2058      nr += 2;
2059   }
2060
2061   if (c->aa_dest_stencil_reg) {
2062      push_force_uncompressed();
2063      emit(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
2064	   fs_reg(brw_vec8_grf(c->aa_dest_stencil_reg, 0)));
2065      pop_force_uncompressed();
2066   }
2067
2068   /* Reserve space for color. It'll be filled in per MRT below. */
2069   int color_mrf = nr;
2070   nr += 4 * reg_width;
2071
2072   if (c->source_depth_to_render_target) {
2073      if (intel->gen == 6 && c->dispatch_width == 16) {
2074	 /* For outputting oDepth on gen6, SIMD8 writes have to be
2075	  * used.  This would require 8-wide moves of each half to
2076	  * message regs, kind of like pre-gen5 SIMD16 FB writes.
2077	  * Just bail on doing so for now.
2078	  */
2079	 fail("Missing support for simd16 depth writes on gen6\n");
2080      }
2081
2082      if (c->computes_depth) {
2083	 /* Hand over gl_FragDepth. */
2084	 assert(this->frag_depth);
2085	 fs_reg depth = *(variable_storage(this->frag_depth));
2086
2087	 emit(BRW_OPCODE_MOV, fs_reg(MRF, nr), depth);
2088      } else {
2089	 /* Pass through the payload depth. */
2090	 emit(BRW_OPCODE_MOV, fs_reg(MRF, nr),
2091	      fs_reg(brw_vec8_grf(c->source_depth_reg, 0)));
2092      }
2093      nr += reg_width;
2094   }
2095
2096   if (c->dest_depth_reg) {
2097      emit(BRW_OPCODE_MOV, fs_reg(MRF, nr),
2098	   fs_reg(brw_vec8_grf(c->dest_depth_reg, 0)));
2099      nr += reg_width;
2100   }
2101
2102   fs_reg color = reg_undef;
2103   if (this->frag_color)
2104      color = *(variable_storage(this->frag_color));
2105   else if (this->frag_data) {
2106      color = *(variable_storage(this->frag_data));
2107      color.type = BRW_REGISTER_TYPE_F;
2108   }
2109
2110   for (int target = 0; target < c->key.nr_color_regions; target++) {
2111      this->current_annotation = ralloc_asprintf(this->mem_ctx,
2112						 "FB write target %d",
2113						 target);
2114      if (this->frag_color || this->frag_data) {
2115	 for (int i = 0; i < 4; i++) {
2116	    emit(BRW_OPCODE_MOV, fs_reg(MRF, color_mrf + i * reg_width), color);
2117	    color.reg_offset++;
2118	 }
2119      }
2120
2121      if (this->frag_color)
2122	 color.reg_offset -= 4;
2123
2124      fs_inst *inst = emit(FS_OPCODE_FB_WRITE);
2125      inst->target = target;
2126      inst->base_mrf = 0;
2127      inst->mlen = nr;
2128      if (target == c->key.nr_color_regions - 1)
2129	 inst->eot = true;
2130      inst->header_present = header_present;
2131   }
2132
2133   if (c->key.nr_color_regions == 0) {
2134      if (c->key.alpha_test && (this->frag_color || this->frag_data)) {
2135	 /* If the alpha test is enabled but there's no color buffer,
2136	  * we still need to send alpha out the pipeline to our null
2137	  * renderbuffer.
2138	  */
2139	 color.reg_offset += 3;
2140	 emit(BRW_OPCODE_MOV, fs_reg(MRF, color_mrf + 3), color);
2141      }
2142
2143      fs_inst *inst = emit(FS_OPCODE_FB_WRITE);
2144      inst->base_mrf = 0;
2145      inst->mlen = nr;
2146      inst->eot = true;
2147      inst->header_present = header_present;
2148   }
2149
2150   this->current_annotation = NULL;
2151}
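
/* Sketch of the resulting message layout for a hypothetical 8-wide shader
 * with a header, no AA dest stencil, one color target, and no depth
 * output:
 *
 *    m0..m1   header (g0/g1 contents, filled in at codegen time)
 *    m2..m5   color RGBA, one register per channel at 8-wide
 *
 * so mlen == 6, with EOT set on the write to the last color region.
 */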
2152
2153void
2154fs_visitor::generate_fb_write(fs_inst *inst)
2155{
2156   GLboolean eot = inst->eot;
2157   struct brw_reg implied_header;
2158
2159   /* The header is 2 regs, with g0 and g1 as the contents.  g0 will be
2160    * an implied move; here we set up g1.
2161    */
2162   brw_push_insn_state(p);
2163   brw_set_mask_control(p, BRW_MASK_DISABLE);
2164   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2165
2166   if (inst->header_present) {
2167      if (intel->gen >= 6) {
2168	 brw_MOV(p,
2169		 brw_message_reg(inst->base_mrf),
2170		 brw_vec8_grf(0, 0));
2171
2172	 if (inst->target > 0) {
2173	    /* Set the render target index for choosing BLEND_STATE. */
2174	    brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, 0, 2),
2175			      BRW_REGISTER_TYPE_UD),
2176		    brw_imm_ud(inst->target));
2177	 }
2178
2179	 /* Clear viewport index, render target array index. */
2180	 brw_AND(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, 0, 0),
2181			   BRW_REGISTER_TYPE_UD),
2182		 retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD),
2183		 brw_imm_ud(0xf7ff));
2184
2185	 implied_header = brw_null_reg();
2186      } else {
2187	 implied_header = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
2188      }
2189
2190      brw_MOV(p,
2191	      brw_message_reg(inst->base_mrf + 1),
2192	      brw_vec8_grf(1, 0));
2193   } else {
2194      implied_header = brw_null_reg();
2195   }
2196
2197   brw_pop_insn_state(p);
2198
2199   brw_fb_WRITE(p,
2200		c->dispatch_width,
2201		inst->base_mrf,
2202		implied_header,
2203		inst->target,
2204		inst->mlen,
2205		0,
2206		eot,
2207		inst->header_present);
2208}
2209
2210/* Computes the integer pixel x,y values from the origin.
2211 *
2212 * This is the basis of gl_FragCoord computation, but is also used
2213 * pre-gen6 for computing the deltas from v0 that are used for
2214 * interpolation.
2215 */
2216void
2217fs_visitor::generate_pixel_xy(struct brw_reg dst, bool is_x)
2218{
2219   struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
2220   struct brw_reg src;
2221   struct brw_reg deltas;
2222
2223   if (is_x) {
2224      src = stride(suboffset(g1_uw, 4), 2, 4, 0);
2225      deltas = brw_imm_v(0x10101010);
2226   } else {
2227      src = stride(suboffset(g1_uw, 5), 2, 4, 0);
2228      deltas = brw_imm_v(0x11001100);
2229   }
2230
2231   if (c->dispatch_width == 16) {
2232      dst = vec16(dst);
2233   }
2234
2235   /* We do this 8 or 16-wide, but since the destination is UW we
2236    * don't do compression in the 16-wide case.
2237    */
2238   brw_push_insn_state(p);
2239   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2240   brw_ADD(p, dst, src, deltas);
2241   brw_pop_insn_state(p);
2242}
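
/* A sketch of what the ADD above computes (assumption: g1 carries the
 * per-subspan pixel origins, per the payload description in the PRM).
 * brw_imm_v packs eight signed 4-bit values; reading the nibbles of
 * 0x10101010 from low to high gives the per-pixel X offsets
 * {0,1,0,1,0,1,0,1}, and 0x11001100 gives the Y offsets {0,0,1,1,0,0,1,1}
 * -- the pixel positions within each 2x2 subspan (tl, tr, bl, br).  The
 * <2;4,0> region repeats each subspan origin four times, so for
 * hypothetical origins (X0,Y0) and (X1,Y1):
 *
 *    pixel_x = { X0, X0+1, X0,   X0+1, X1, X1+1, X1,   X1+1 }
 *    pixel_y = { Y0, Y0,   Y0+1, Y0+1, Y1, Y1,   Y1+1, Y1+1 }
 */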
2243
2244void
2245fs_visitor::generate_linterp(fs_inst *inst,
2246			     struct brw_reg dst, struct brw_reg *src)
2247{
2248   struct brw_reg delta_x = src[0];
2249   struct brw_reg delta_y = src[1];
2250   struct brw_reg interp = src[2];
2251
2252   if (brw->has_pln &&
2253       delta_y.nr == delta_x.nr + 1 &&
2254       (intel->gen >= 6 || (delta_x.nr & 1) == 0)) {
2255      brw_PLN(p, dst, interp, delta_x);
2256   } else {
2257      brw_LINE(p, brw_null_reg(), interp, delta_x);
2258      brw_MAC(p, dst, suboffset(interp, 1), delta_y);
2259   }
2260}
2261
2262void
2263fs_visitor::generate_math(fs_inst *inst,
2264			  struct brw_reg dst, struct brw_reg *src)
2265{
2266   int op;
2267
2268   switch (inst->opcode) {
2269   case FS_OPCODE_RCP:
2270      op = BRW_MATH_FUNCTION_INV;
2271      break;
2272   case FS_OPCODE_RSQ:
2273      op = BRW_MATH_FUNCTION_RSQ;
2274      break;
2275   case FS_OPCODE_SQRT:
2276      op = BRW_MATH_FUNCTION_SQRT;
2277      break;
2278   case FS_OPCODE_EXP2:
2279      op = BRW_MATH_FUNCTION_EXP;
2280      break;
2281   case FS_OPCODE_LOG2:
2282      op = BRW_MATH_FUNCTION_LOG;
2283      break;
2284   case FS_OPCODE_POW:
2285      op = BRW_MATH_FUNCTION_POW;
2286      break;
2287   case FS_OPCODE_SIN:
2288      op = BRW_MATH_FUNCTION_SIN;
2289      break;
2290   case FS_OPCODE_COS:
2291      op = BRW_MATH_FUNCTION_COS;
2292      break;
2293   default:
2294      assert(!"not reached: unknown math function");
2295      op = 0;
2296      break;
2297   }
2298
2299   if (intel->gen >= 6) {
2300      assert(inst->mlen == 0);
2301
2302      if (inst->opcode == FS_OPCODE_POW) {
2303	 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2304	 brw_math2(p, dst, op, src[0], src[1]);
2305
2306	 if (c->dispatch_width == 16) {
2307	    brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
2308	    brw_math2(p, sechalf(dst), op, sechalf(src[0]), sechalf(src[1]));
2309	    brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
2310	 }
2311      } else {
2312	 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2313	 brw_math(p, dst,
2314		  op,
2315		  inst->saturate ? BRW_MATH_SATURATE_SATURATE :
2316		  BRW_MATH_SATURATE_NONE,
2317		  0, src[0],
2318		  BRW_MATH_DATA_VECTOR,
2319		  BRW_MATH_PRECISION_FULL);
2320
2321	 if (c->dispatch_width == 16) {
2322	    brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
2323	    brw_math(p, sechalf(dst),
2324		     op,
2325		     inst->saturate ? BRW_MATH_SATURATE_SATURATE :
2326		     BRW_MATH_SATURATE_NONE,
2327		     0, sechalf(src[0]),
2328		     BRW_MATH_DATA_VECTOR,
2329		     BRW_MATH_PRECISION_FULL);
2330	    brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
2331	 }
2332      }
2333   } else {
2334      assert(inst->mlen >= 1);
2335
2336      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2337      brw_math(p, dst,
2338	       op,
2339	       inst->saturate ? BRW_MATH_SATURATE_SATURATE :
2340	       BRW_MATH_SATURATE_NONE,
2341	       inst->base_mrf, src[0],
2342	       BRW_MATH_DATA_VECTOR,
2343	       BRW_MATH_PRECISION_FULL);
2344
2345      if (c->dispatch_width == 16) {
2346	 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
2347	 brw_math(p, sechalf(dst),
2348		  op,
2349		  inst->saturate ? BRW_MATH_SATURATE_SATURATE :
2350		  BRW_MATH_SATURATE_NONE,
2351		  inst->base_mrf + 1, sechalf(src[0]),
2352		  BRW_MATH_DATA_VECTOR,
2353		  BRW_MATH_PRECISION_FULL);
2354	 brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
2355      }
2356   }
2357}
2358
2359void
2360fs_visitor::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
2361{
2362   int msg_type = -1;
2363   int rlen = 4;
2364   uint32_t simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
2365
2366   if (c->dispatch_width == 16) {
2367      rlen = 8;
2368      dst = vec16(dst);
2369      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
2370   }
2371
2372   if (intel->gen >= 5) {
2373      switch (inst->opcode) {
2374      case FS_OPCODE_TEX:
2375	 if (inst->shadow_compare) {
2376	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE;
2377	 } else {
2378	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE;
2379	 }
2380	 break;
2381      case FS_OPCODE_TXB:
2382	 if (inst->shadow_compare) {
2383	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE;
2384	 } else {
2385	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS;
2386	 }
2387	 break;
2388      case FS_OPCODE_TXL:
2389	 if (inst->shadow_compare) {
2390	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE;
2391	 } else {
2392	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD;
2393	 }
2394	 break;
2395      case FS_OPCODE_TXD:
2396	 assert(!"TXD isn't supported on gen5+ yet.");
2397	 break;
2398      }
2399   } else {
2400      switch (inst->opcode) {
2401      case FS_OPCODE_TEX:
2402	 /* Note that G45 and older determine shadow compare and dispatch width
2403	  * from message length for most messages.
2404	  */
2405	 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
2406	 if (inst->shadow_compare) {
2407	    assert(inst->mlen == 6);
2408	 } else {
2409	    assert(inst->mlen <= 4);
2410	 }
2411	 break;
2412      case FS_OPCODE_TXB:
2413	 if (inst->shadow_compare) {
2414	    assert(inst->mlen == 6);
2415	    msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_COMPARE;
2416	 } else {
2417	    assert(inst->mlen == 9);
2418	    msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
2419	    simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
2420	 }
2421	 break;
2422      case FS_OPCODE_TXL:
2423	 if (inst->shadow_compare) {
2424	    assert(inst->mlen == 6);
2425	    msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE;
2426	 } else {
2427	    assert(inst->mlen == 9);
2428	    msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD;
2429	    simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
2430	 }
2431	 break;
2432      case FS_OPCODE_TXD:
2433	 assert(!"TXD isn't supported on gen4 yet.");
2434	 break;
2435      }
2436   }
2437   assert(msg_type != -1);
2438
2439   brw_SAMPLE(p,
2440	      retype(dst, BRW_REGISTER_TYPE_UW),
2441	      inst->base_mrf,
2442	      src,
2443              SURF_INDEX_TEXTURE(inst->sampler),
2444	      inst->sampler,
2445	      WRITEMASK_XYZW,
2446	      msg_type,
2447	      rlen,
2448	      inst->mlen,
2449	      0,
2450	      1,
2451	      simd_mode);
2452}
2453
2454
2455/* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
2456 * looking like:
2457 *
2458 * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
2459 *
2460 * and we're trying to produce:
2461 *
2462 *           DDX                     DDY
2463 * dst: (ss0.tr - ss0.tl)     (ss0.tl - ss0.bl)
2464 *      (ss0.tr - ss0.tl)     (ss0.tr - ss0.br)
2465 *      (ss0.br - ss0.bl)     (ss0.tl - ss0.bl)
2466 *      (ss0.br - ss0.bl)     (ss0.tr - ss0.br)
2467 *      (ss1.tr - ss1.tl)     (ss1.tl - ss1.bl)
2468 *      (ss1.tr - ss1.tl)     (ss1.tr - ss1.br)
2469 *      (ss1.br - ss1.bl)     (ss1.tl - ss1.bl)
2470 *      (ss1.br - ss1.bl)     (ss1.tr - ss1.br)
2471 *
2472 * and add another set of two more subspans if in 16-pixel dispatch mode.
2473 *
2474 * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
2475 * for each pair, and vertstride = 2 jumps us 2 elements after processing a
2476 * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
2477 * between each other.  We could probably do it like ddx and swizzle the right
2478 * order later, but bail for now and just produce
2479 * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
2480 */
2481void
2482fs_visitor::generate_ddx(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
2483{
2484   struct brw_reg src0 = brw_reg(src.file, src.nr, 1,
2485				 BRW_REGISTER_TYPE_F,
2486				 BRW_VERTICAL_STRIDE_2,
2487				 BRW_WIDTH_2,
2488				 BRW_HORIZONTAL_STRIDE_0,
2489				 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
2490   struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
2491				 BRW_REGISTER_TYPE_F,
2492				 BRW_VERTICAL_STRIDE_2,
2493				 BRW_WIDTH_2,
2494				 BRW_HORIZONTAL_STRIDE_0,
2495				 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
2496   brw_ADD(p, dst, src0, negate(src1));
2497}
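
/* A minimal, self-contained sketch (not part of the driver) of how the two
 * <2;2,0> regions above walk a register laid out as
 * [ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br], assuming the
 * usual region semantics: rows of `width` elements, hstride steps within a
 * row, vertstride steps between row starts.
 */
static void
example_ddx_region_walk(const float reg[8], float dst[8])
{
   for (int i = 0; i < 8; i++) {
      /* width 2 / hstride 0 repeats one element per pair; vertstride 2
       * advances to the next pair.  src0 has suboffset 1, src1 suboffset 0.
       */
      int base = (i / 2) * 2;
      dst[i] = reg[base + 1] - reg[base]; /* tr - tl, then br - bl */
   }
}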
2498
2499void
2500fs_visitor::generate_ddy(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
2501{
2502   struct brw_reg src0 = brw_reg(src.file, src.nr, 0,
2503				 BRW_REGISTER_TYPE_F,
2504				 BRW_VERTICAL_STRIDE_4,
2505				 BRW_WIDTH_4,
2506				 BRW_HORIZONTAL_STRIDE_0,
2507				 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
2508   struct brw_reg src1 = brw_reg(src.file, src.nr, 2,
2509				 BRW_REGISTER_TYPE_F,
2510				 BRW_VERTICAL_STRIDE_4,
2511				 BRW_WIDTH_4,
2512				 BRW_HORIZONTAL_STRIDE_0,
2513				 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
2514   brw_ADD(p, dst, src0, negate(src1));
2515}
2516
2517void
2518fs_visitor::generate_discard_not(fs_inst *inst, struct brw_reg mask)
2519{
2520   if (intel->gen >= 6) {
2521      /* Gen6 no longer has the mask reg for us to just read the
2522       * active channels from.  However, cmp updates just the channels
2523       * of the flag reg that are enabled, so we can get at the
2524       * channel enables that way.  In this step, make a reg of ones
2525       * we'll compare to.
2526       */
2527      brw_MOV(p, mask, brw_imm_ud(1));
2528   } else {
2529      brw_push_insn_state(p);
2530      brw_set_mask_control(p, BRW_MASK_DISABLE);
2531      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2532      brw_NOT(p, mask, brw_mask_reg(1)); /* IMASK */
2533      brw_pop_insn_state(p);
2534   }
2535}
2536
2537void
2538fs_visitor::generate_discard_and(fs_inst *inst, struct brw_reg mask)
2539{
2540   if (intel->gen >= 6) {
2541      struct brw_reg f0 = brw_flag_reg();
2542      struct brw_reg g1 = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
2543
2544      brw_push_insn_state(p);
2545      brw_set_mask_control(p, BRW_MASK_DISABLE);
2546      brw_MOV(p, f0, brw_imm_uw(0xffff)); /* inactive channels undiscarded */
2547      brw_pop_insn_state(p);
2548
2549      brw_CMP(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UD),
2550	      BRW_CONDITIONAL_Z, mask, brw_imm_ud(0)); /* active channels fail test */
2551      /* Undo CMP's whacking of predication */
2552      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2553
2554      brw_push_insn_state(p);
2555      brw_set_mask_control(p, BRW_MASK_DISABLE);
2556      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2557      brw_AND(p, g1, f0, g1);
2558      brw_pop_insn_state(p);
2559   } else {
2560      struct brw_reg g0 = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
2561
2562      mask = brw_uw1_reg(mask.file, mask.nr, 0);
2563
2564      brw_push_insn_state(p);
2565      brw_set_mask_control(p, BRW_MASK_DISABLE);
2566      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2567      brw_AND(p, g0, mask, g0);
2568      brw_pop_insn_state(p);
2569   }
2570}
2571
2572void
2573fs_visitor::generate_spill(fs_inst *inst, struct brw_reg src)
2574{
2575   assert(inst->mlen != 0);
2576
2577   brw_MOV(p,
2578	   retype(brw_message_reg(inst->base_mrf + 1), BRW_REGISTER_TYPE_UD),
2579	   retype(src, BRW_REGISTER_TYPE_UD));
2580   brw_oword_block_write_scratch(p, brw_message_reg(inst->base_mrf), 1,
2581				 inst->offset);
2582}
2583
2584void
2585fs_visitor::generate_unspill(fs_inst *inst, struct brw_reg dst)
2586{
2587   assert(inst->mlen != 0);
2588
2589   /* Clear any post destination dependencies that would be ignored by
2590    * the block read.  See the B-Spec for pre-gen5 send instruction.
2591    *
2592    * This could use a better solution, since texture sampling and
2593    * math reads could potentially run into it as well -- anywhere
2594    * that we have a SEND with a destination that is a register that
2595    * was written but not read within the last N instructions (what's
2596    * N?  unsure).  This is rare because of dead code elimination, but
2597    * not impossible.
2598    */
2599   if (intel->gen == 4 && !intel->is_g4x)
2600      brw_MOV(p, brw_null_reg(), dst);
2601
2602   brw_oword_block_read_scratch(p, dst, brw_message_reg(inst->base_mrf), 1,
2603				inst->offset);
2604
2605   if (intel->gen == 4 && !intel->is_g4x) {
2606      /* gen4 errata: destination from a send can't be used as a
2607       * destination until it's been read.  Just read it so we don't
2608       * have to worry.
2609       */
2610      brw_MOV(p, brw_null_reg(), dst);
2611   }
2612}
2613
2614
2615void
2616fs_visitor::generate_pull_constant_load(fs_inst *inst, struct brw_reg dst)
2617{
2618   assert(inst->mlen != 0);
2619
2620   /* Clear any post destination dependencies that would be ignored by
2621    * the block read.  See the B-Spec for pre-gen5 send instruction.
2622    *
2623    * This could use a better solution, since texture sampling and
2624    * math reads could potentially run into it as well -- anywhere
2625    * that we have a SEND with a destination that is a register that
2626    * was written but not read within the last N instructions (what's
2627    * N?  unsure).  This is rare because of dead code elimination, but
2628    * not impossible.
2629    */
2630   if (intel->gen == 4 && !intel->is_g4x)
2631      brw_MOV(p, brw_null_reg(), dst);
2632
2633   brw_oword_block_read(p, dst, brw_message_reg(inst->base_mrf),
2634			inst->offset, SURF_INDEX_FRAG_CONST_BUFFER);
2635
2636   if (intel->gen == 4 && !intel->is_g4x) {
2637      /* gen4 errata: destination from a send can't be used as a
2638       * destination until it's been read.  Just read it so we don't
2639       * have to worry.
2640       */
2641      brw_MOV(p, brw_null_reg(), dst);
2642   }
2643}
2644
2645/**
2646 * To be called after the last _mesa_add_state_reference() call, to
2647 * set up prog_data.param[] for assign_curb_setup() and
2648 * setup_pull_constants().
2649 */
2650void
2651fs_visitor::setup_paramvalues_refs()
2652{
2653   /* Set up the pointers to ParamValues now that that array is finalized. */
2654   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
2655      c->prog_data.param[i] =
2656	 fp->Base.Parameters->ParameterValues[this->param_index[i]] +
2657	 this->param_offset[i];
2658   }
2659}
2660
2661void
2662fs_visitor::assign_curb_setup()
2663{
2664   c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
2665   if (c->dispatch_width == 8) {
2666      c->prog_data.first_curbe_grf = c->nr_payload_regs;
2667   } else {
2668      c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
2669   }
2670
2671   /* Map the offsets in the UNIFORM file to fixed HW regs. */
2672   foreach_iter(exec_list_iterator, iter, this->instructions) {
2673      fs_inst *inst = (fs_inst *)iter.get();
2674
2675      for (unsigned int i = 0; i < 3; i++) {
2676	 if (inst->src[i].file == UNIFORM) {
2677	    int constant_nr = inst->src[i].hw_reg + inst->src[i].reg_offset;
2678	    struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
2679						  constant_nr / 8,
2680						  constant_nr % 8);
2681
2682	    inst->src[i].file = FIXED_HW_REG;
2683	    inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
2684	 }
2685      }
2686   }
2687}
2688
2689void
2690fs_visitor::calculate_urb_setup()
2691{
2692   for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
2693      urb_setup[i] = -1;
2694   }
2695
2696   int urb_next = 0;
2697   /* Figure out where each of the incoming setup attributes lands. */
2698   if (intel->gen >= 6) {
2699      for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
2700	 if (brw->fragment_program->Base.InputsRead & BITFIELD64_BIT(i)) {
2701	    urb_setup[i] = urb_next++;
2702	 }
2703      }
2704   } else {
2705      /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
2706      for (unsigned int i = 0; i < VERT_RESULT_MAX; i++) {
2707	 if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
2708	    int fp_index;
2709
2710	    if (i >= VERT_RESULT_VAR0)
2711	       fp_index = i - (VERT_RESULT_VAR0 - FRAG_ATTRIB_VAR0);
2712	    else if (i <= VERT_RESULT_TEX7)
2713	       fp_index = i;
2714	    else
2715	       fp_index = -1;
2716
2717	    if (fp_index >= 0)
2718	       urb_setup[fp_index] = urb_next++;
2719	 }
2720      }
2721   }
2722
2723   /* Each attribute is 4 setup channels, each of which is half a reg. */
2724   c->prog_data.urb_read_length = urb_next * 2;
2725}
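
/* For example (a hypothetical gen6 shader reading two varyings): urb_next
 * ends up 2, so urb_read_length == 4 -- each attribute is four setup
 * channels at half a register apiece, i.e. two registers of URB data per
 * attribute.
 */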
2726
2727void
2728fs_visitor::assign_urb_setup()
2729{
2730   int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
2731
2732   /* Offset all the urb_setup[] index by the actual position of the
2733    * setup regs, now that the location of the constants has been chosen.
2734    */
2735   foreach_iter(exec_list_iterator, iter, this->instructions) {
2736      fs_inst *inst = (fs_inst *)iter.get();
2737
2738      if (inst->opcode == FS_OPCODE_LINTERP) {
2739	 assert(inst->src[2].file == FIXED_HW_REG);
2740	 inst->src[2].fixed_hw_reg.nr += urb_start;
2741      }
2742
2743      if (inst->opcode == FS_OPCODE_CINTERP) {
2744	 assert(inst->src[0].file == FIXED_HW_REG);
2745	 inst->src[0].fixed_hw_reg.nr += urb_start;
2746      }
2747   }
2748
2749   this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
2750}
2751
2752/**
2753 * Split large virtual GRFs into separate components if we can.
2754 *
2755 * This is mostly duplicated with what brw_fs_vector_splitting does,
2756 * but that's really conservative because it's afraid of doing
2757 * splitting that doesn't result in real progress after the rest of
2758 * the optimization phases, which would cause infinite looping in
2759 * optimization.  We can do it once here, safely.  This also has the
2760 * opportunity to split interpolated values, or maybe even uniforms,
2761 * which we don't have at the IR level.
2762 *
2763 * We want to split, because virtual GRFs are what we register
2764 * allocate and spill (due to contiguousness requirements for some
2765 * instructions), and they're what we naturally generate in the
2766 * codegen process, but most virtual GRFs don't actually need to be
2767 * contiguous sets of GRFs.  If we split, we'll end up with reduced
2768 * live intervals and better dead code elimination and coalescing.
2769 */
2770void
2771fs_visitor::split_virtual_grfs()
2772{
2773   int num_vars = this->virtual_grf_next;
2774   bool split_grf[num_vars];
2775   int new_virtual_grf[num_vars];
2776
2777   /* Try to split anything > 0 sized. */
2778   for (int i = 0; i < num_vars; i++) {
2779      if (this->virtual_grf_sizes[i] != 1)
2780	 split_grf[i] = true;
2781      else
2782	 split_grf[i] = false;
2783   }
2784
2785   if (brw->has_pln) {
2786      /* PLN opcodes rely on the delta_xy being contiguous. */
2787      split_grf[this->delta_x.reg] = false;
2788   }
2789
2790   foreach_iter(exec_list_iterator, iter, this->instructions) {
2791      fs_inst *inst = (fs_inst *)iter.get();
2792
2793      /* Texturing produces 4 contiguous registers, so no splitting. */
2794      if (inst->is_tex()) {
2795	 split_grf[inst->dst.reg] = false;
2796      }
2797   }
2798
2799   /* Allocate new space for split regs.  Note that the virtual
2800    * numbers will be contiguous.
2801    */
2802   for (int i = 0; i < num_vars; i++) {
2803      if (split_grf[i]) {
2804	 new_virtual_grf[i] = virtual_grf_alloc(1);
2805	 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
2806	    int reg = virtual_grf_alloc(1);
2807	    assert(reg == new_virtual_grf[i] + j - 1);
2808	    (void) reg;
2809	 }
2810	 this->virtual_grf_sizes[i] = 1;
2811      }
2812   }
2813
2814   foreach_iter(exec_list_iterator, iter, this->instructions) {
2815      fs_inst *inst = (fs_inst *)iter.get();
2816
2817      if (inst->dst.file == GRF &&
2818	  split_grf[inst->dst.reg] &&
2819	  inst->dst.reg_offset != 0) {
2820	 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
2821			  inst->dst.reg_offset - 1);
2822	 inst->dst.reg_offset = 0;
2823      }
2824      for (int i = 0; i < 3; i++) {
2825	 if (inst->src[i].file == GRF &&
2826	     split_grf[inst->src[i].reg] &&
2827	     inst->src[i].reg_offset != 0) {
2828	    inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
2829				inst->src[i].reg_offset - 1);
2830	    inst->src[i].reg_offset = 0;
2831	 }
2832      }
2833   }
2834   this->live_intervals_valid = false;
2835}
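
/* Example of the remapping above (vgrf numbers hypothetical): a size-3
 * vgrf 5 keeps number 5 for reg_offset 0 (shrunk to size 1), while
 * reg_offsets 1 and 2 are renamed to the freshly allocated single-register
 * vgrfs new_virtual_grf[5] and new_virtual_grf[5] + 1, both with
 * reg_offset 0.
 */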
2836
2837/**
2838 * Choose accesses from the UNIFORM file to demote to using the pull
2839 * constant buffer.
2840 *
2841 * We allow a fragment shader to have more than the specified minimum
2842 * maximum number of fragment shader uniform components (64).  If
2843 * there are too many of these, they'd fill up all of the register space.
2844 * So, this will push some of them out to the pull constant buffer and
2845 * update the program to load them.
2846 */
2847void
2848fs_visitor::setup_pull_constants()
2849{
2850   /* Only allow 16 registers (128 uniform components) as push constants. */
2851   unsigned int max_uniform_components = 16 * 8;
2852   if (c->prog_data.nr_params <= max_uniform_components)
2853      return;
2854
2855   /* Just demote the end of the list.  We could probably do better
2856    * here, demoting things that are rarely used in the program first.
2857    */
2858   int pull_uniform_base = max_uniform_components;
2859   int pull_uniform_count = c->prog_data.nr_params - pull_uniform_base;
2860
2861   foreach_iter(exec_list_iterator, iter, this->instructions) {
2862      fs_inst *inst = (fs_inst *)iter.get();
2863
2864      for (int i = 0; i < 3; i++) {
2865	 if (inst->src[i].file != UNIFORM)
2866	    continue;
2867
2868	 int uniform_nr = inst->src[i].hw_reg + inst->src[i].reg_offset;
2869	 if (uniform_nr < pull_uniform_base)
2870	    continue;
2871
2872	 fs_reg dst = fs_reg(this, glsl_type::float_type);
2873	 fs_inst *pull = new(mem_ctx) fs_inst(FS_OPCODE_PULL_CONSTANT_LOAD,
2874					      dst);
2875	 pull->offset = ((uniform_nr - pull_uniform_base) * 4) & ~15;
2876	 pull->ir = inst->ir;
2877	 pull->annotation = inst->annotation;
2878	 pull->base_mrf = 14;
2879	 pull->mlen = 1;
2880
2881	 inst->insert_before(pull);
2882
2883	 inst->src[i].file = GRF;
2884	 inst->src[i].reg = dst.reg;
2885	 inst->src[i].reg_offset = 0;
2886	 inst->src[i].smear = (uniform_nr - pull_uniform_base) & 3;
2887      }
2888   }
2889
2890   for (int i = 0; i < pull_uniform_count; i++) {
2891      c->prog_data.pull_param[i] = c->prog_data.param[pull_uniform_base + i];
2892      c->prog_data.pull_param_convert[i] =
2893	 c->prog_data.param_convert[pull_uniform_base + i];
2894   }
2895   c->prog_data.nr_params -= pull_uniform_count;
2896   c->prog_data.nr_pull_params = pull_uniform_count;
2897}
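
/* Sketch of the rewrite above, with hypothetical register numbers.  For a
 * source reading uniform component 130, past the 128-component push limit:
 *
 *    before:  ADD dst, u130, g4
 *    after:   FS_OPCODE_PULL_CONSTANT_LOAD tmp, offset 0   ((130-128)*4 & ~15)
 *             ADD dst, tmp.smear<2>, g4                    ((130-128) & 3)
 *
 * The load fetches an aligned oword (four floats) from the constant
 * buffer, and smear picks the wanted component out of it.
 */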
2898
2899void
2900fs_visitor::calculate_live_intervals()
2901{
2902   int num_vars = this->virtual_grf_next;
2903   int *def = ralloc_array(mem_ctx, int, num_vars);
2904   int *use = ralloc_array(mem_ctx, int, num_vars);
2905   int loop_depth = 0;
2906   int loop_start = 0;
2907   int bb_header_ip = 0;
2908
2909   if (this->live_intervals_valid)
2910      return;
2911
2912   for (int i = 0; i < num_vars; i++) {
2913      def[i] = MAX_INSTRUCTION;
2914      use[i] = -1;
2915   }
2916
2917   int ip = 0;
2918   foreach_iter(exec_list_iterator, iter, this->instructions) {
2919      fs_inst *inst = (fs_inst *)iter.get();
2920
2921      if (inst->opcode == BRW_OPCODE_DO) {
2922	 if (loop_depth++ == 0)
2923	    loop_start = ip;
2924      } else if (inst->opcode == BRW_OPCODE_WHILE) {
2925	 loop_depth--;
2926
2927	 if (loop_depth == 0) {
2928	    /* Patches up the use of vars marked for being live across
2929	     * the whole loop.
2930	     */
2931	    for (int i = 0; i < num_vars; i++) {
2932	       if (use[i] == loop_start) {
2933		  use[i] = ip;
2934	       }
2935	    }
2936	 }
2937      } else {
2938	 for (unsigned int i = 0; i < 3; i++) {
2939	    if (inst->src[i].file == GRF && inst->src[i].reg != 0) {
2940	       int reg = inst->src[i].reg;
2941
2942	       if (!loop_depth) {
2943		  use[reg] = ip;
2944	       } else {
2945		  def[reg] = MIN2(loop_start, def[reg]);
2946		  use[reg] = loop_start;
2947
2948		  /* Nothing else can now move our start later in
2949		   * the loop, because def[reg] points before the
2950		   * bb header.
2951		   */
2952	       }
2953	    }
2954	 }
2955	 if (inst->dst.file == GRF && inst->dst.reg != 0) {
2956	    int reg = inst->dst.reg;
2957
2958	    if (!loop_depth) {
2959	       def[reg] = MIN2(def[reg], ip);
2960	    } else {
2961	       def[reg] = MIN2(def[reg], loop_start);
2962	    }
2963	 }
2964      }
2965
2966      ip++;
2967
2968      /* Set the basic block header IP.  This is used for determining
2969       * if a complete def of a single-register virtual GRF in a loop
2970       * dominates a use in the same basic block.  It's a quick way to
2971       * reduce the live interval range of most registers used in a
2972       * loop.
2973       */
2974      if (inst->opcode == BRW_OPCODE_IF ||
2975	  inst->opcode == BRW_OPCODE_ELSE ||
2976	  inst->opcode == BRW_OPCODE_ENDIF ||
2977	  inst->opcode == BRW_OPCODE_DO ||
2978	  inst->opcode == BRW_OPCODE_WHILE ||
2979	  inst->opcode == BRW_OPCODE_BREAK ||
2980	  inst->opcode == BRW_OPCODE_CONTINUE) {
2981	 bb_header_ip = ip;
2982      }
2983   }
2984
2985   ralloc_free(this->virtual_grf_def);
2986   ralloc_free(this->virtual_grf_use);
2987   this->virtual_grf_def = def;
2988   this->virtual_grf_use = use;
2989
2990   this->live_intervals_valid = true;
2991}
2992
2993/**
2994 * Attempts to move immediate constants into the immediate
2995 * constant slot of following instructions.
2996 *
2997 * Immediate constants are a bit tricky -- they have to be in the last
2998 * operand slot, and you can't do abs/negate on them.
2999 */
3000
3001bool
3002fs_visitor::propagate_constants()
3003{
3004   bool progress = false;
3005
3006   /* Need to update the MRF tracking for compressed instructions. */
3007   if (c->dispatch_width == 16)
3008      return false;
3009
3010   calculate_live_intervals();
3011
3012   foreach_iter(exec_list_iterator, iter, this->instructions) {
3013      fs_inst *inst = (fs_inst *)iter.get();
3014
3015      if (inst->opcode != BRW_OPCODE_MOV ||
3016	  inst->predicated ||
3017	  inst->dst.file != GRF || inst->src[0].file != IMM ||
3018	  inst->dst.type != inst->src[0].type)
3019	 continue;
3020
3021      /* Don't bother with cases where we should have had the
3022       * operation on the constant folded in GLSL already.
3023       */
3024      if (inst->saturate)
3025	 continue;
3026
3027      /* Found a move of a constant to a GRF.  Find anything else using the GRF
3028       * before it's written, and replace it with the constant if we can.
3029       */
3030      exec_list_iterator scan_iter = iter;
3031      scan_iter.next();
3032      for (; scan_iter.has_next(); scan_iter.next()) {
3033	 fs_inst *scan_inst = (fs_inst *)scan_iter.get();
3034
3035	 if (scan_inst->opcode == BRW_OPCODE_DO ||
3036	     scan_inst->opcode == BRW_OPCODE_WHILE ||
3037	     scan_inst->opcode == BRW_OPCODE_ELSE ||
3038	     scan_inst->opcode == BRW_OPCODE_ENDIF) {
3039	    break;
3040	 }
3041
3042	 for (int i = 2; i >= 0; i--) {
3043	    if (scan_inst->src[i].file != GRF ||
3044		scan_inst->src[i].reg != inst->dst.reg ||
3045		scan_inst->src[i].reg_offset != inst->dst.reg_offset)
3046	       continue;
3047
3048	    /* Don't bother with cases where we should have had the
3049	     * operation on the constant folded in GLSL already.
3050	     */
3051	    if (scan_inst->src[i].negate || scan_inst->src[i].abs)
3052	       continue;
3053
3054	    switch (scan_inst->opcode) {
3055	    case BRW_OPCODE_MOV:
3056	       scan_inst->src[i] = inst->src[0];
3057	       progress = true;
3058	       break;
3059
3060	    case BRW_OPCODE_MUL:
3061	    case BRW_OPCODE_ADD:
3062	       if (i == 1) {
3063		  scan_inst->src[i] = inst->src[0];
3064		  progress = true;
3065	       } else if (i == 0 && scan_inst->src[1].file != IMM) {
3066		  /* Fit this constant in by commuting the operands */
3067		  scan_inst->src[0] = scan_inst->src[1];
3068		  scan_inst->src[1] = inst->src[0];
3069		  progress = true;
3070	       }
3071	       break;
3072
3073	    case BRW_OPCODE_CMP:
3074	       if (i == 1) {
3075		  scan_inst->src[i] = inst->src[0];
3076		  progress = true;
3077	       } else if (i == 0 && scan_inst->src[1].file != IMM) {
3078		  uint32_t new_cmod;
3079
3080		  new_cmod = brw_swap_cmod(scan_inst->conditional_mod);
3081		  if (new_cmod != ~0u) {
3082		     /* Fit this constant in by swapping the operands and
3083		      * flipping the test
3084		      */
3085		     scan_inst->src[0] = scan_inst->src[1];
3086		     scan_inst->src[1] = inst->src[0];
3087		     scan_inst->conditional_mod = new_cmod;
3088		     progress = true;
3089		  }
3090	       }
3091	       break;
3092
3093	    case BRW_OPCODE_SEL:
3094	       if (i == 1) {
3095		  scan_inst->src[i] = inst->src[0];
3096		  progress = true;
3097	       } else if (i == 0 && scan_inst->src[1].file != IMM) {
3098		  /* Fit this constant in by swapping the operands and
3099		   * flipping the predicate
3100		   */
3101		  scan_inst->src[0] = scan_inst->src[1];
3102		  scan_inst->src[1] = inst->src[0];
3103		  scan_inst->predicate_inverse = !scan_inst->predicate_inverse;
3104		  progress = true;
3105	       }
3106	       break;
3107	    }
3108	 }
3109
3110	 if (scan_inst->dst.file == GRF &&
3111	     scan_inst->dst.reg == inst->dst.reg &&
3112	     (scan_inst->dst.reg_offset == inst->dst.reg_offset ||
3113	      scan_inst->is_tex())) {
3114	    break;
3115	 }
3116      }
3117   }
3118
3119   if (progress)
3120       this->live_intervals_valid = false;
3121
3122   return progress;
3123}
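
/* Example of the rewrites above (vgrf numbers hypothetical):
 *
 *    MOV g10, 2.0F                     MOV g10, 2.0F
 *    ADD g11, g10, g12        ->       ADD g11, g12, 2.0F
 *    CMP.l g13, g10, g12      ->       CMP.g g13, g12, 2.0F
 *
 * Since the immediate can only occupy the last operand slot, uses in src0
 * are handled by commuting the operands; CMP additionally swaps its
 * conditional mod via brw_swap_cmod(), and SEL flips predicate_inverse.
 * The now-dead MOV is left for dead_code_eliminate() to clean up.
 */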
3124/**
3125 * Must be called after calculate_live_intervals() to remove unused
3126 * writes to registers -- register allocation will fail otherwise
3127 * because something defined but never used won't be considered to
3128 * interfere with other regs.
3129 */
3130bool
3131fs_visitor::dead_code_eliminate()
3132{
3133   bool progress = false;
3134   int pc = 0;
3135
3136   calculate_live_intervals();
3137
3138   foreach_iter(exec_list_iterator, iter, this->instructions) {
3139      fs_inst *inst = (fs_inst *)iter.get();
3140
3141      if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
3142	 inst->remove();
3143	 progress = true;
3144      }
3145
3146      pc++;
3147   }
3148
3149   if (progress)
3150      live_intervals_valid = false;
3151
3152   return progress;
3153}
3154
3155bool
3156fs_visitor::register_coalesce()
3157{
3158   bool progress = false;
3159   int if_depth = 0;
3160   int loop_depth = 0;
3161
3162   foreach_iter(exec_list_iterator, iter, this->instructions) {
3163      fs_inst *inst = (fs_inst *)iter.get();
3164
3165      /* Make sure that we dominate the instructions we're going to
3166       * scan for interference with our coalescing; otherwise we won't
3167       * have scanned far enough to see whether anything actually
3168       * interferes.  We don't dominate the following instructions if
3169       * we're in a loop or an if block.
3170       */
3171      switch (inst->opcode) {
3172      case BRW_OPCODE_DO:
3173	 loop_depth++;
3174	 break;
3175      case BRW_OPCODE_WHILE:
3176	 loop_depth--;
3177	 break;
3178      case BRW_OPCODE_IF:
3179	 if_depth++;
3180	 break;
3181      case BRW_OPCODE_ENDIF:
3182	 if_depth--;
3183	 break;
3184      }
3185      if (loop_depth || if_depth)
3186	 continue;
3187
3188      if (inst->opcode != BRW_OPCODE_MOV ||
3189	  inst->predicated ||
3190	  inst->saturate ||
3191	  inst->dst.file != GRF || inst->src[0].file != GRF ||
3192	  inst->dst.type != inst->src[0].type)
3193	 continue;
3194
3195      bool has_source_modifiers = inst->src[0].abs || inst->src[0].negate;
3196
3197      /* Found a move of a GRF to a GRF.  Let's see if we can coalesce
3198       * them: check for no writes to either one until the exit of the
3199       * program.
3200       */
3201      bool interfered = false;
3202      exec_list_iterator scan_iter = iter;
3203      scan_iter.next();
3204      for (; scan_iter.has_next(); scan_iter.next()) {
3205	 fs_inst *scan_inst = (fs_inst *)scan_iter.get();
3206
3207	 if (scan_inst->dst.file == GRF) {
3208	    if (scan_inst->dst.reg == inst->dst.reg &&
3209		(scan_inst->dst.reg_offset == inst->dst.reg_offset ||
3210		 scan_inst->is_tex())) {
3211	       interfered = true;
3212	       break;
3213	    }
3214	    if (scan_inst->dst.reg == inst->src[0].reg &&
3215		(scan_inst->dst.reg_offset == inst->src[0].reg_offset ||
3216		 scan_inst->is_tex())) {
3217	       interfered = true;
3218	       break;
3219	    }
3220	 }
3221
3222	 /* The gen6 MATH instruction can't handle source modifiers, so avoid
3223	  * coalescing those for now.  We should do something more specific.
3224	  */
3225	 if (intel->gen == 6 && scan_inst->is_math() && has_source_modifiers) {
3226	    interfered = true;
3227	    break;
3228	 }
3229      }
3230      if (interfered) {
3231	 continue;
3232      }
3233
3234      /* Rewrite the later usage to point at the source of the move to
3235       * be removed.
3236       */
3237      for (exec_list_iterator scan_iter = iter; scan_iter.has_next();
3238	   scan_iter.next()) {
3239	 fs_inst *scan_inst = (fs_inst *)scan_iter.get();
3240
3241	 for (int i = 0; i < 3; i++) {
3242	    if (scan_inst->src[i].file == GRF &&
3243		scan_inst->src[i].reg == inst->dst.reg &&
3244		scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
3245	       scan_inst->src[i].reg = inst->src[0].reg;
3246	       scan_inst->src[i].reg_offset = inst->src[0].reg_offset;
3247	       scan_inst->src[i].abs |= inst->src[0].abs;
3248	       scan_inst->src[i].negate ^= inst->src[0].negate;
3249	       scan_inst->src[i].smear = inst->src[0].smear;
3250	    }
3251	 }
3252      }
3253
3254      inst->remove();
3255      progress = true;
3256   }
3257
3258   if (progress)
3259      live_intervals_valid = false;
3260
3261   return progress;
3262}
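
/* Example of a successful coalesce (vgrf numbers hypothetical):
 *
 *    MOV g10, -g8                     (removed)
 *    MUL g11, g10, g9         ->      MUL g11, -g8, g9
 *
 * valid only because nothing rewrites g8 or g10 between the MOV and the
 * end of the program; the MOV's abs/negate/smear modifiers are folded
 * into each rewritten use.
 */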
3263
3264
3265bool
3266fs_visitor::compute_to_mrf()
3267{
3268   bool progress = false;
3269   int next_ip = 0;
3270
3271   /* Need to update the MRF tracking for compressed instructions. */
3272   if (c->dispatch_width == 16)
3273      return false;
3274
3275   calculate_live_intervals();
3276
3277   foreach_iter(exec_list_iterator, iter, this->instructions) {
3278      fs_inst *inst = (fs_inst *)iter.get();
3279
3280      int ip = next_ip;
3281      next_ip++;
3282
3283      if (inst->opcode != BRW_OPCODE_MOV ||
3284	  inst->predicated ||
3285	  inst->dst.file != MRF || inst->src[0].file != GRF ||
3286	  inst->dst.type != inst->src[0].type ||
3287	  inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
3288	 continue;
3289
3290      /* Can't compute-to-MRF this GRF if someone else was going to
3291       * read it later.
3292       */
3293      if (this->virtual_grf_use[inst->src[0].reg] > ip)
3294	 continue;
3295
3296      /* Found a move of a GRF to a MRF.  Let's see if we can go
3297       * rewrite the thing that made this GRF to write into the MRF.
3298       */
3299      fs_inst *scan_inst;
3300      for (scan_inst = (fs_inst *)inst->prev;
3301	   scan_inst->prev != NULL;
3302	   scan_inst = (fs_inst *)scan_inst->prev) {
3303	 if (scan_inst->dst.file == GRF &&
3304	     scan_inst->dst.reg == inst->src[0].reg) {
3305	    /* Found the last thing to write our reg we want to turn
3306	     * into a compute-to-MRF.
3307	     */
3308
3309	    if (scan_inst->is_tex()) {
3310	       /* Texturing writes several contiguous regs, so we can't
3311		* compute-to-mrf that.
3312		*/
3313	       break;
3314	    }
3315
3316	    /* If it's predicated, it (probably) didn't populate all
3317	     * the channels.
3318	     */
3319	    if (scan_inst->predicated)
3320	       break;
3321
3322	    /* SEND instructions can't have MRF as a destination. */
3323	    if (scan_inst->mlen)
3324	       break;
3325
3326	    if (intel->gen >= 6) {
3327	       /* gen6 math instructions must have the destination be
3328		* GRF, so no compute-to-MRF for them.
3329		*/
3330	       if (scan_inst->is_math()) {
3331		  break;
3332	       }
3333	    }
3334
3335	    if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
3336	       /* Found the creator of our MRF's source value. */
3337	       scan_inst->dst.file = MRF;
3338	       scan_inst->dst.hw_reg = inst->dst.hw_reg;
3339	       scan_inst->saturate |= inst->saturate;
3340	       inst->remove();
3341	       progress = true;
3342	    }
3343	    break;
3344	 }
3345
3346	 /* We don't handle flow control here.  Most computation of
3347	  * values that end up in MRFs are shortly before the MRF
3348	  * write anyway.
3349	  */
3350	 if (scan_inst->opcode == BRW_OPCODE_DO ||
3351	     scan_inst->opcode == BRW_OPCODE_WHILE ||
3352	     scan_inst->opcode == BRW_OPCODE_ELSE ||
3353	     scan_inst->opcode == BRW_OPCODE_ENDIF) {
3354	    break;
3355	 }
3356
3357	 /* You can't read from an MRF, so if someone else reads our
3358	  * MRF's source GRF that we wanted to rewrite, that stops us.
3359	  */
3360	 bool interfered = false;
3361	 for (int i = 0; i < 3; i++) {
3362	    if (scan_inst->src[i].file == GRF &&
3363		scan_inst->src[i].reg == inst->src[0].reg &&
3364		scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
3365	       interfered = true;
3366	    }
3367	 }
3368	 if (interfered)
3369	    break;
3370
3371	 if (scan_inst->dst.file == MRF &&
3372	     scan_inst->dst.hw_reg == inst->dst.hw_reg) {
3373	    /* Somebody else wrote our MRF here, so we can't
3374	     * compute-to-MRF before that.
3375	     */
3376	    break;
3377	 }
3378
3379	 if (scan_inst->mlen > 0) {
3380	    /* Found a SEND instruction, which means that there are
3381	     * live values in MRFs from base_mrf to base_mrf +
3382	     * scan_inst->mlen - 1.  Don't go pushing our MRF write up
3383	     * above it.
3384	     */
3385	    if (inst->dst.hw_reg >= scan_inst->base_mrf &&
3386		inst->dst.hw_reg < scan_inst->base_mrf + scan_inst->mlen) {
3387	       break;
3388	    }
3389	 }
3390      }
3391   }
3392
3393   return progress;
3394}
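
/* Example of the transformation above (register numbers hypothetical):
 *
 *    MUL g10, g8, g9                  MUL m2, g8, g9
 *    MOV m2, g10              ->      (MOV removed)
 *
 * This only fires when g10 has no later readers and no flow control,
 * predication, SEND, or conflicting MRF write sits between the two
 * instructions.
 */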
3395
3396/**
3397 * Walks through basic blocks, looking for repeated MRF writes and
3398 * removing the later ones.
3399 */
3400bool
3401fs_visitor::remove_duplicate_mrf_writes()
3402{
3403   fs_inst *last_mrf_move[16];
3404   bool progress = false;
3405
3406   /* Need to update the MRF tracking for compressed instructions. */
3407   if (c->dispatch_width == 16)
3408      return false;
3409
3410   memset(last_mrf_move, 0, sizeof(last_mrf_move));
3411
3412   foreach_iter(exec_list_iterator, iter, this->instructions) {
3413      fs_inst *inst = (fs_inst *)iter.get();
3414
3415      switch (inst->opcode) {
3416      case BRW_OPCODE_DO:
3417      case BRW_OPCODE_WHILE:
3418      case BRW_OPCODE_IF:
3419      case BRW_OPCODE_ELSE:
3420      case BRW_OPCODE_ENDIF:
3421	 memset(last_mrf_move, 0, sizeof(last_mrf_move));
3422	 continue;
3423      default:
3424	 break;
3425      }
3426
3427      if (inst->opcode == BRW_OPCODE_MOV &&
3428	  inst->dst.file == MRF) {
3429	 fs_inst *prev_inst = last_mrf_move[inst->dst.hw_reg];
3430	 if (prev_inst && inst->equals(prev_inst)) {
3431	    inst->remove();
3432	    progress = true;
3433	    continue;
3434	 }
3435      }
3436
3437      /* Clear out the last-write records for MRFs that were overwritten. */
3438      if (inst->dst.file == MRF) {
3439	 last_mrf_move[inst->dst.hw_reg] = NULL;
3440      }
3441
3442      if (inst->mlen > 0) {
3443	 /* Found a SEND instruction, which will include two or fewer
3444	  * implied MRF writes.  We could do better here.
3445	  */
3446	 for (int i = 0; i < implied_mrf_writes(inst); i++) {
3447	    last_mrf_move[inst->base_mrf + i] = NULL;
3448	 }
3449      }
3450
3451      /* Clear out any MRF move records whose sources got overwritten. */
3452      if (inst->dst.file == GRF) {
3453	 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
3454	    if (last_mrf_move[i] &&
3455		last_mrf_move[i]->src[0].reg == inst->dst.reg) {
3456	       last_mrf_move[i] = NULL;
3457	    }
3458	 }
3459      }
3460
3461      if (inst->opcode == BRW_OPCODE_MOV &&
3462	  inst->dst.file == MRF &&
3463	  inst->src[0].file == GRF &&
3464	  !inst->predicated) {
3465	 last_mrf_move[inst->dst.hw_reg] = inst;
3466      }
3467   }
3468
3469   return progress;
3470}
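
/* Example of what gets dropped above (register numbers hypothetical):
 *
 *    MOV m2, g10
 *    MOV m2, g10     <- removed: identical to the tracked last write to m2
 *
 * The last-write records are reset at flow control, whenever an MRF or
 * its source GRF is overwritten, and across a SEND's implied MRF writes.
 */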
3471
3472bool
3473fs_visitor::virtual_grf_interferes(int a, int b)
3474{
3475   int start = MAX2(this->virtual_grf_def[a], this->virtual_grf_def[b]);
3476   int end = MIN2(this->virtual_grf_use[a], this->virtual_grf_use[b]);
3477
3478   /* We can't handle dead register writes here, without iterating
3479    * over the whole instruction stream to find every single dead
3480    * write to that register to compare to the live interval of the
3481    * other register.  Just assert that dead_code_eliminate() has been
3482    * called.
3483    */
3484   assert((this->virtual_grf_use[a] != -1 ||
3485	   this->virtual_grf_def[a] == MAX_INSTRUCTION) &&
3486	  (this->virtual_grf_use[b] != -1 ||
3487	   this->virtual_grf_def[b] == MAX_INSTRUCTION));
3488
3489   /* If the register is used to store 16 values of less than float
3490    * size (only the case for pixel_[xy]), then we can't allocate
3491    * another dword-sized thing to that register that would be used in
3492    * the same instruction.  This is because when the GPU decodes (for
3493    * example):
3494    *
3495    * (declare (in ) vec4 gl_FragCoord@0x97766a0)
3496    * add(16)         g6<1>F          g6<8,8,1>UW     0.5F { align1 compr };
3497    *
3498    * it's actually processed as:
3499    * add(8)         g6<1>F          g6<8,8,1>UW     0.5F { align1 };
3500    * add(8)         g7<1>F          g6.8<8,8,1>UW   0.5F { align1 sechalf };
3501    *
3502    * so our second half values in g6 got overwritten in the first
3503    * half.
3504    */
3505   if (c->dispatch_width == 16 && (this->pixel_x.reg == a ||
3506				   this->pixel_x.reg == b ||
3507				   this->pixel_y.reg == a ||
3508				   this->pixel_y.reg == b)) {
3509      return start <= end;
3510   }
3511
3512   return start < end;
3513}
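
/* Concretely: with the def/use points from calculate_live_intervals(), two
 * vgrfs may normally share a register when one's def coincides with the
 * other's last use (start == end), since an instruction reads its sources
 * before writing its destination.  The pixel_x/pixel_y exception above
 * can't allow that, because the second half of a compressed instruction
 * would clobber the packed UW values before they were fully read.
 */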
3514
3515static struct brw_reg brw_reg_from_fs_reg(fs_reg *reg)
3516{
3517   struct brw_reg brw_reg;
3518
3519   switch (reg->file) {
3520   case GRF:
3521   case ARF:
3522   case MRF:
3523      if (reg->smear == -1) {
3524	 brw_reg = brw_vec8_reg(reg->file,
3525				reg->hw_reg, 0);
3526      } else {
3527	 brw_reg = brw_vec1_reg(reg->file,
3528				reg->hw_reg, reg->smear);
3529      }
3530      brw_reg = retype(brw_reg, reg->type);
3531      break;
3532   case IMM:
3533      switch (reg->type) {
3534      case BRW_REGISTER_TYPE_F:
3535	 brw_reg = brw_imm_f(reg->imm.f);
3536	 break;
3537      case BRW_REGISTER_TYPE_D:
3538	 brw_reg = brw_imm_d(reg->imm.i);
3539	 break;
3540      case BRW_REGISTER_TYPE_UD:
3541	 brw_reg = brw_imm_ud(reg->imm.u);
3542	 break;
3543      default:
3544	 assert(!"not reached");
3545	 brw_reg = brw_null_reg();
3546	 break;
3547      }
3548      break;
3549   case FIXED_HW_REG:
3550      brw_reg = reg->fixed_hw_reg;
3551      break;
3552   case BAD_FILE:
3553      /* Probably unused. */
3554      brw_reg = brw_null_reg();
3555      break;
3556   case UNIFORM:
3557      assert(!"not reached");
3558      brw_reg = brw_null_reg();
3559      break;
3560   default:
3561      assert(!"not reached");
3562      brw_reg = brw_null_reg();
3563      break;
3564   }
3565   if (reg->abs)
3566      brw_reg = brw_abs(brw_reg);
3567   if (reg->negate)
3568      brw_reg = negate(brw_reg);
3569
3570   return brw_reg;
3571}
3572
3573void
3574fs_visitor::generate_code()
3575{
3576   int last_native_inst = p->nr_insn;
3577   const char *last_annotation_string = NULL;
3578   ir_instruction *last_annotation_ir = NULL;
3579
3580   int if_stack_array_size = 16;
3581   int loop_stack_array_size = 16;
3582   int if_stack_depth = 0, loop_stack_depth = 0;
3583   brw_instruction **if_stack =
3584      rzalloc_array(this->mem_ctx, brw_instruction *, if_stack_array_size);
3585   brw_instruction **loop_stack =
3586      rzalloc_array(this->mem_ctx, brw_instruction *, loop_stack_array_size);
3587   int *if_depth_in_loop =
3588      rzalloc_array(this->mem_ctx, int, loop_stack_array_size);
3589
3590
3591   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3592      printf("Native code for fragment shader %d (%d-wide dispatch):\n",
3593	     ctx->Shader.CurrentFragmentProgram->Name, c->dispatch_width);
3594   }
3595
3596   foreach_iter(exec_list_iterator, iter, this->instructions) {
3597      fs_inst *inst = (fs_inst *)iter.get();
3598      struct brw_reg src[3], dst;
3599
3600      if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3601	 if (last_annotation_ir != inst->ir) {
3602	    last_annotation_ir = inst->ir;
3603	    if (last_annotation_ir) {
3604	       printf("   ");
3605	       last_annotation_ir->print();
3606	       printf("\n");
3607	    }
3608	 }
3609	 if (last_annotation_string != inst->annotation) {
3610	    last_annotation_string = inst->annotation;
3611	    if (last_annotation_string)
3612	       printf("   %s\n", last_annotation_string);
3613	 }
3614      }
3615
3616      for (unsigned int i = 0; i < 3; i++) {
3617	 src[i] = brw_reg_from_fs_reg(&inst->src[i]);
3618      }
3619      dst = brw_reg_from_fs_reg(&inst->dst);
3620
3621      brw_set_conditionalmod(p, inst->conditional_mod);
3622      brw_set_predicate_control(p, inst->predicated);
3623      brw_set_predicate_inverse(p, inst->predicate_inverse);
3624      brw_set_saturate(p, inst->saturate);
3625
      if (inst->force_uncompressed || c->dispatch_width == 8) {
         brw_set_compression_control(p, BRW_COMPRESSION_NONE);
      } else if (inst->force_sechalf) {
         brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
      } else {
         brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
      }

      switch (inst->opcode) {
      case BRW_OPCODE_MOV:
         brw_MOV(p, dst, src[0]);
         break;
      case BRW_OPCODE_ADD:
         brw_ADD(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_MUL:
         brw_MUL(p, dst, src[0], src[1]);
         break;

      case BRW_OPCODE_FRC:
         brw_FRC(p, dst, src[0]);
         break;
      case BRW_OPCODE_RNDD:
         brw_RNDD(p, dst, src[0]);
         break;
      case BRW_OPCODE_RNDE:
         brw_RNDE(p, dst, src[0]);
         break;
      case BRW_OPCODE_RNDZ:
         brw_RNDZ(p, dst, src[0]);
         break;

      case BRW_OPCODE_AND:
         brw_AND(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_OR:
         brw_OR(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_XOR:
         brw_XOR(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_NOT:
         brw_NOT(p, dst, src[0]);
         break;
      case BRW_OPCODE_ASR:
         brw_ASR(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_SHR:
         brw_SHR(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_SHL:
         brw_SHL(p, dst, src[0], src[1]);
         break;

      case BRW_OPCODE_CMP:
         brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
         break;
      case BRW_OPCODE_SEL:
         brw_SEL(p, dst, src[0], src[1]);
         break;

      case BRW_OPCODE_IF:
         if (inst->src[0].file != BAD_FILE) {
            assert(intel->gen >= 6);
            if_stack[if_stack_depth] =
               gen6_IF(p, inst->conditional_mod, src[0], src[1]);
         } else {
            if_stack[if_stack_depth] = brw_IF(p, BRW_EXECUTE_8);
         }
         if_depth_in_loop[loop_stack_depth]++;
         if_stack_depth++;
         if (if_stack_array_size <= if_stack_depth) {
            if_stack_array_size *= 2;
            if_stack = reralloc(this->mem_ctx, if_stack, brw_instruction *,
                                if_stack_array_size);
         }
         break;

      case BRW_OPCODE_ELSE:
         if_stack[if_stack_depth - 1] =
            brw_ELSE(p, if_stack[if_stack_depth - 1]);
         break;
      case BRW_OPCODE_ENDIF:
         if_stack_depth--;
         brw_ENDIF(p, if_stack[if_stack_depth]);
         if_depth_in_loop[loop_stack_depth]--;
         break;

      case BRW_OPCODE_DO:
         loop_stack[loop_stack_depth++] = brw_DO(p, BRW_EXECUTE_8);
         if (loop_stack_array_size <= loop_stack_depth) {
            loop_stack_array_size *= 2;
            loop_stack = reralloc(this->mem_ctx, loop_stack,
                                  brw_instruction *, loop_stack_array_size);
            if_depth_in_loop = reralloc(this->mem_ctx, if_depth_in_loop, int,
                                        loop_stack_array_size);
         }
         if_depth_in_loop[loop_stack_depth] = 0;
         break;

      case BRW_OPCODE_BREAK:
         brw_BREAK(p, if_depth_in_loop[loop_stack_depth]);
         brw_set_predicate_control(p, BRW_PREDICATE_NONE);
         break;
      case BRW_OPCODE_CONTINUE:
         /* FINISHME: We still need to write the loop instruction support. */
         if (intel->gen >= 6)
            gen6_CONT(p, loop_stack[loop_stack_depth - 1]);
         else
            brw_CONT(p, if_depth_in_loop[loop_stack_depth]);
         brw_set_predicate_control(p, BRW_PREDICATE_NONE);
         break;

      case BRW_OPCODE_WHILE: {
         struct brw_instruction *inst0, *inst1;
         GLuint br = 1;

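         /* Gen5 counts jump distances in 64-bit units (half an
          * instruction) rather than gen4's whole-instruction units, so
          * the patched jump counts below are scaled by br.
          */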
         if (intel->gen >= 5)
            br = 2;

         assert(loop_stack_depth > 0);
         loop_stack_depth--;
         inst0 = inst1 = brw_WHILE(p, loop_stack[loop_stack_depth]);
         if (intel->gen < 6) {
            /* Patch up all the BREAK/CONT instructions from the last
             * BGNLOOP.
             */
            while (inst0 > loop_stack[loop_stack_depth]) {
               inst0--;
               if (inst0->header.opcode == BRW_OPCODE_BREAK &&
                   inst0->bits3.if_else.jump_count == 0) {
                  inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
               } else if (inst0->header.opcode == BRW_OPCODE_CONTINUE &&
                          inst0->bits3.if_else.jump_count == 0) {
                  inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
               }
            }
         }
         break;
      }

      case FS_OPCODE_RCP:
      case FS_OPCODE_RSQ:
      case FS_OPCODE_SQRT:
      case FS_OPCODE_EXP2:
      case FS_OPCODE_LOG2:
      case FS_OPCODE_POW:
      case FS_OPCODE_SIN:
      case FS_OPCODE_COS:
         generate_math(inst, dst, src);
         break;
      case FS_OPCODE_PIXEL_X:
         generate_pixel_xy(dst, true);
         break;
      case FS_OPCODE_PIXEL_Y:
         generate_pixel_xy(dst, false);
         break;
      case FS_OPCODE_CINTERP:
         brw_MOV(p, dst, src[0]);
         break;
      case FS_OPCODE_LINTERP:
         generate_linterp(inst, dst, src);
         break;
      case FS_OPCODE_TEX:
      case FS_OPCODE_TXB:
      case FS_OPCODE_TXD:
      case FS_OPCODE_TXL:
         generate_tex(inst, dst, src[0]);
         break;
      case FS_OPCODE_DISCARD_NOT:
         generate_discard_not(inst, dst);
         break;
      case FS_OPCODE_DISCARD_AND:
         generate_discard_and(inst, src[0]);
         break;
      case FS_OPCODE_DDX:
         generate_ddx(inst, dst, src[0]);
         break;
      case FS_OPCODE_DDY:
         generate_ddy(inst, dst, src[0]);
         break;

      case FS_OPCODE_SPILL:
         generate_spill(inst, src[0]);
         break;

      case FS_OPCODE_UNSPILL:
         generate_unspill(inst, dst);
         break;

      case FS_OPCODE_PULL_CONSTANT_LOAD:
         generate_pull_constant_load(inst, dst);
         break;

      case FS_OPCODE_FB_WRITE:
         generate_fb_write(inst);
         break;
      default:
         if (inst->opcode < (int)ARRAY_SIZE(brw_opcodes)) {
            _mesa_problem(ctx, "Unsupported opcode `%s' in FS",
                          brw_opcodes[inst->opcode].name);
         } else {
            _mesa_problem(ctx, "Unsupported opcode %d in FS", inst->opcode);
         }
         fail("unsupported opcode in FS\n");
      }

      if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
         for (unsigned int i = last_native_inst; i < p->nr_insn; i++) {
            if (0) {
               printf("0x%08x 0x%08x 0x%08x 0x%08x ",
                      ((uint32_t *)&p->store[i])[3],
                      ((uint32_t *)&p->store[i])[2],
                      ((uint32_t *)&p->store[i])[1],
                      ((uint32_t *)&p->store[i])[0]);
            }
            brw_disasm(stdout, &p->store[i], intel->gen);
         }
      }

      last_native_inst = p->nr_insn;
   }

   ralloc_free(if_stack);
   ralloc_free(loop_stack);
   ralloc_free(if_depth_in_loop);

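   /* With all instructions emitted, fill in the UIP/JIP jump targets that
    * gen6 flow-control instructions carry.
    */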
   brw_set_uip_jip(p);

   /* While the INTEL_DEBUG=wm output above is very nice for debugging FS
    * emit issues, it doesn't include the jump distances, which are often
    * what we want to debug.  Flip this on if you're doing that.
    */
   if (0) {
      if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
         for (unsigned int i = 0; i < p->nr_insn; i++) {
            printf("0x%08x 0x%08x 0x%08x 0x%08x ",
                   ((uint32_t *)&p->store[i])[3],
                   ((uint32_t *)&p->store[i])[2],
                   ((uint32_t *)&p->store[i])[1],
                   ((uint32_t *)&p->store[i])[0]);
            brw_disasm(stdout, &p->store[i], intel->gen);
         }
      }
   }
}

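/* Compile the fragment program at the currently-selected dispatch width:
 * build FS IR by visiting the GLSL IR, optimize it, allocate registers,
 * and emit native code.  Returns false on failure (for example, when
 * register allocation fails).
 */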
bool
fs_visitor::run()
{
   uint32_t prog_offset_16 = 0;

   brw_wm_payload_setup(brw, c);

   if (c->dispatch_width == 16) {
      if (c->prog_data.curb_read_length) {
         /* Haven't hooked in support for uniforms through the 16-wide
          * version yet.
          */
         return false;
      }

      /* Align to a 64-byte boundary; kernel start pointers are specified
       * in 64-byte units.
       */
      while ((c->func.nr_insn * sizeof(struct brw_instruction)) % 64) {
         brw_NOP(p);
      }

      /* Save off the start of this 16-wide program in case we succeed. */
      prog_offset_16 = c->func.nr_insn * sizeof(struct brw_instruction);

      brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
   }

   if (0) {
      emit_dummy_fs();
   } else {
      calculate_urb_setup();
      if (intel->gen < 6)
         emit_interpolation_setup_gen4();
      else
         emit_interpolation_setup_gen6();

      /* Generate FS IR for main().  (The visitor only descends into
       * functions called "main".)
       */
      foreach_iter(exec_list_iterator, iter, *shader->ir) {
         ir_instruction *ir = (ir_instruction *)iter.get();
         base_ir = ir;
         ir->accept(this);
      }

      emit_fb_writes();

      split_virtual_grfs();

      setup_paramvalues_refs();
      setup_pull_constants();

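      /* Iterate the optimization passes to a fixed point; each pass can
       * expose new opportunities for the others.
       */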
      bool progress;
      do {
         progress = false;

         progress = remove_duplicate_mrf_writes() || progress;

         progress = propagate_constants() || progress;
         progress = register_coalesce() || progress;
         progress = compute_to_mrf() || progress;
         progress = dead_code_eliminate() || progress;
      } while (progress);

      schedule_instructions();

      assign_curb_setup();
      assign_urb_setup();

      if (0) {
         /* Debug of register spilling: go spill everything. */
         int virtual_grf_count = virtual_grf_next;
         for (int i = 1; i < virtual_grf_count; i++) {
            spill_reg(i);
         }
      }

      if (0)
         assign_regs_trivial();
      else {
         while (!assign_regs()) {
            if (failed)
               break;
         }
      }
   }
   assert(force_uncompressed_stack == 0);
   assert(force_sechalf_stack == 0);

   if (!failed)
      generate_code();

   if (failed)
      return false;

   if (c->dispatch_width == 8) {
      c->prog_data.total_grf = grf_used;
   } else {
      c->prog_data.total_grf_16 = grf_used;
      c->prog_data.prog_offset_16 = prog_offset_16;
   }

   return !failed;
}

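/* Main entry point for FS code generation: compile the 8-wide fragment
 * program, then additionally attempt a 16-wide version on gen6+.
 */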
bool
brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c)
{
   struct intel_context *intel = &brw->intel;
   struct gl_context *ctx = &intel->ctx;
   struct gl_shader_program *prog = ctx->Shader.CurrentFragmentProgram;

   if (!prog)
      return false;

   struct brw_shader *shader =
      (struct brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
   if (!shader)
      return false;

   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
      printf("GLSL IR for native fragment shader %d:\n", prog->Name);
      _mesa_print_ir(shader->ir, NULL);
      printf("\n");
   }

   /* Now the main event: Visit the shader IR and generate our FS IR for it.
    */
   c->dispatch_width = 8;

   fs_visitor v(c, shader);
   if (!v.run()) {
      /* FINISHME: Cleanly fail, test at link time, etc. */
      assert(!"not reached");
      return false;
   }

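   /* Try a 16-wide compile on gen6+.  The return value is ignored: if the
    * 16-wide compile fails, the 8-wide program emitted above is still
    * usable on its own.
    */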
   if (intel->gen >= 6) {
      c->dispatch_width = 16;
      fs_visitor v2(c, shader);
      v2.run();
   }

   c->prog_data.dispatch_width = 8;

   return true;
}