brw_fs.cpp revision ff6e3c73f6553cd29b915497b5b00e3ef158a27d
1/*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * Authors:
24 *    Eric Anholt <eric@anholt.net>
25 *
26 */
27
28extern "C" {
29
30#include <sys/types.h>
31
32#include "main/macros.h"
33#include "main/shaderobj.h"
34#include "main/uniforms.h"
35#include "program/prog_parameter.h"
36#include "program/prog_print.h"
37#include "program/prog_optimize.h"
38#include "program/register_allocate.h"
39#include "program/sampler.h"
40#include "program/hash_table.h"
41#include "brw_context.h"
42#include "brw_eu.h"
43#include "brw_wm.h"
44}
45#include "brw_fs.h"
46#include "../glsl/glsl_types.h"
47#include "../glsl/ir_optimization.h"
48#include "../glsl/ir_print_visitor.h"
49
50#define MAX_INSTRUCTION (1 << 30)
51static struct brw_reg brw_reg_from_fs_reg(class fs_reg *reg);
52
53struct gl_shader *
54brw_new_shader(struct gl_context *ctx, GLuint name, GLuint type)
55{
56   struct brw_shader *shader;
57
58   shader = rzalloc(NULL, struct brw_shader);
59   if (shader) {
60      shader->base.Type = type;
61      shader->base.Name = name;
62      _mesa_init_shader(ctx, &shader->base);
63   }
64
65   return &shader->base;
66}
67
68struct gl_shader_program *
69brw_new_shader_program(struct gl_context *ctx, GLuint name)
70{
71   struct brw_shader_program *prog;
72   prog = rzalloc(NULL, struct brw_shader_program);
73   if (prog) {
74      prog->base.Name = name;
75      _mesa_init_shader_program(ctx, &prog->base);
76   }
77   return &prog->base;
78}
79
80GLboolean
81brw_link_shader(struct gl_context *ctx, struct gl_shader_program *prog)
82{
83   struct brw_context *brw = brw_context(ctx);
84   struct intel_context *intel = &brw->intel;
85
86   struct brw_shader *shader =
87      (struct brw_shader *)prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
88   if (shader != NULL) {
89      void *mem_ctx = ralloc_context(NULL);
90      bool progress;
91
92      if (shader->ir)
93	 ralloc_free(shader->ir);
94      shader->ir = new(shader) exec_list;
95      clone_ir_list(mem_ctx, shader->ir, shader->base.ir);
96
97      do_mat_op_to_vec(shader->ir);
98      lower_instructions(shader->ir,
99			 MOD_TO_FRACT |
100			 DIV_TO_MUL_RCP |
101			 SUB_TO_ADD_NEG |
102			 EXP_TO_EXP2 |
103			 LOG_TO_LOG2);
104
105      /* Pre-gen6 HW can only nest if-statements 16 deep.  Beyond this,
106       * if-statements need to be flattened.
107       */
108      if (intel->gen < 6)
109	 lower_if_to_cond_assign(shader->ir, 16);
110
111      do_lower_texture_projection(shader->ir);
112      do_vec_index_to_cond_assign(shader->ir);
113      brw_do_cubemap_normalize(shader->ir);
114      lower_noise(shader->ir);
115      lower_quadop_vector(shader->ir, false);
116      lower_variable_index_to_cond_assign(shader->ir,
117					  GL_TRUE, /* input */
118					  GL_TRUE, /* output */
119					  GL_TRUE, /* temp */
120					  GL_TRUE /* uniform */
121					  );
122
123      do {
124	 progress = false;
125
126	 brw_do_channel_expressions(shader->ir);
127	 brw_do_vector_splitting(shader->ir);
128
129	 progress = do_lower_jumps(shader->ir, true, true,
130				   true, /* main return */
131				   false, /* continue */
132				   false /* loops */
133				   ) || progress;
134
135	 progress = do_common_optimization(shader->ir, true, 32) || progress;
136      } while (progress);
137
138      validate_ir_tree(shader->ir);
139
140      reparent_ir(shader->ir, shader->ir);
141      ralloc_free(mem_ctx);
142   }
143
144   if (!_mesa_ir_link_shader(ctx, prog))
145      return GL_FALSE;
146
147   return GL_TRUE;
148}
149
150static int
151type_size(const struct glsl_type *type)
152{
153   unsigned int size, i;
154
155   switch (type->base_type) {
156   case GLSL_TYPE_UINT:
157   case GLSL_TYPE_INT:
158   case GLSL_TYPE_FLOAT:
159   case GLSL_TYPE_BOOL:
160      return type->components();
161   case GLSL_TYPE_ARRAY:
162      return type_size(type->fields.array) * type->length;
163   case GLSL_TYPE_STRUCT:
164      size = 0;
165      for (i = 0; i < type->length; i++) {
166	 size += type_size(type->fields.structure[i].type);
167      }
168      return size;
169   case GLSL_TYPE_SAMPLER:
170      /* Samplers take up no register space, since they're baked in at
171       * link time.
172       */
173      return 0;
174   default:
175      assert(!"not reached");
176      return 0;
177   }
178}
179
180void
181fs_visitor::fail(const char *format, ...)
182{
183   if (!failed) {
184      failed = true;
185
186      if (INTEL_DEBUG & DEBUG_WM) {
187	 fprintf(stderr, "FS compile failed: ");
188
189	 va_list va;
190	 va_start(va, format);
191	 vfprintf(stderr, format, va);
192	 va_end(va);
193      }
194   }
195}
196
197void
198fs_visitor::push_force_uncompressed()
199{
200   force_uncompressed_stack++;
201}
202
203void
204fs_visitor::pop_force_uncompressed()
205{
206   force_uncompressed_stack--;
207   assert(force_uncompressed_stack >= 0);
208}
209
210void
211fs_visitor::push_force_sechalf()
212{
213   force_sechalf_stack++;
214}
215
216void
217fs_visitor::pop_force_sechalf()
218{
219   force_sechalf_stack--;
220   assert(force_sechalf_stack >= 0);
221}
222
223/**
224 * Returns how many MRFs an FS opcode will write over.
225 *
226 * Note that this is not the 0 or 1 implied writes in an actual gen
227 * instruction -- the FS opcodes often generate MOVs in addition.
228 */
229int
230fs_visitor::implied_mrf_writes(fs_inst *inst)
231{
232   if (inst->mlen == 0)
233      return 0;
234
235   switch (inst->opcode) {
236   case FS_OPCODE_RCP:
237   case FS_OPCODE_RSQ:
238   case FS_OPCODE_SQRT:
239   case FS_OPCODE_EXP2:
240   case FS_OPCODE_LOG2:
241   case FS_OPCODE_SIN:
242   case FS_OPCODE_COS:
243      return 1 * c->dispatch_width / 8;
244   case FS_OPCODE_POW:
245      return 2 * c->dispatch_width / 8;
246   case FS_OPCODE_TEX:
247   case FS_OPCODE_TXB:
248   case FS_OPCODE_TXD:
249   case FS_OPCODE_TXL:
250      return 1;
251   case FS_OPCODE_FB_WRITE:
252      return 2;
253   case FS_OPCODE_PULL_CONSTANT_LOAD:
254   case FS_OPCODE_UNSPILL:
255      return 1;
256   case FS_OPCODE_SPILL:
257      return 2;
258   default:
259      assert(!"not reached");
260      return inst->mlen;
261   }
262}
263
264int
265fs_visitor::virtual_grf_alloc(int size)
266{
267   if (virtual_grf_array_size <= virtual_grf_next) {
268      if (virtual_grf_array_size == 0)
269	 virtual_grf_array_size = 16;
270      else
271	 virtual_grf_array_size *= 2;
272      virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
273				   virtual_grf_array_size);
274
275      /* This slot is always unused. */
276      virtual_grf_sizes[0] = 0;
277   }
278   virtual_grf_sizes[virtual_grf_next] = size;
279   return virtual_grf_next++;
280}
281
282/** Fixed HW reg constructor. */
283fs_reg::fs_reg(enum register_file file, int hw_reg)
284{
285   init();
286   this->file = file;
287   this->hw_reg = hw_reg;
288   this->type = BRW_REGISTER_TYPE_F;
289}
290
291/** Fixed HW reg constructor. */
292fs_reg::fs_reg(enum register_file file, int hw_reg, uint32_t type)
293{
294   init();
295   this->file = file;
296   this->hw_reg = hw_reg;
297   this->type = type;
298}
299
300int
301brw_type_for_base_type(const struct glsl_type *type)
302{
303   switch (type->base_type) {
304   case GLSL_TYPE_FLOAT:
305      return BRW_REGISTER_TYPE_F;
306   case GLSL_TYPE_INT:
307   case GLSL_TYPE_BOOL:
308      return BRW_REGISTER_TYPE_D;
309   case GLSL_TYPE_UINT:
310      return BRW_REGISTER_TYPE_UD;
311   case GLSL_TYPE_ARRAY:
312   case GLSL_TYPE_STRUCT:
313   case GLSL_TYPE_SAMPLER:
314      /* These should be overridden with the type of the member when
315       * dereferenced into.  BRW_REGISTER_TYPE_UD seems like a likely
316       * way to trip up if we don't.
317       */
318      return BRW_REGISTER_TYPE_UD;
319   default:
320      assert(!"not reached");
321      return BRW_REGISTER_TYPE_F;
322   }
323}
324
325/** Automatic reg constructor. */
326fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
327{
328   init();
329
330   this->file = GRF;
331   this->reg = v->virtual_grf_alloc(type_size(type));
332   this->reg_offset = 0;
333   this->type = brw_type_for_base_type(type);
334}
335
336fs_reg *
337fs_visitor::variable_storage(ir_variable *var)
338{
339   return (fs_reg *)hash_table_find(this->variable_ht, var);
340}
341
342void
343import_uniforms_callback(const void *key,
344			 void *data,
345			 void *closure)
346{
347   struct hash_table *dst_ht = (struct hash_table *)closure;
348   const fs_reg *reg = (const fs_reg *)data;
349
350   if (reg->file != UNIFORM)
351      return;
352
353   hash_table_insert(dst_ht, data, key);
354}
355
356/* For 16-wide, we need to follow from the uniform setup of 8-wide dispatch.
357 * This brings in those uniform definitions
358 */
359void
360fs_visitor::import_uniforms(struct hash_table *src_variable_ht)
361{
362   hash_table_call_foreach(src_variable_ht,
363			   import_uniforms_callback,
364			   variable_ht);
365}
366
367/* Our support for uniforms is piggy-backed on the struct
368 * gl_fragment_program, because that's where the values actually
369 * get stored, rather than in some global gl_shader_program uniform
370 * store.
371 */
372int
373fs_visitor::setup_uniform_values(int loc, const glsl_type *type)
374{
375   unsigned int offset = 0;
376
377   if (type->is_matrix()) {
378      const glsl_type *column = glsl_type::get_instance(GLSL_TYPE_FLOAT,
379							type->vector_elements,
380							1);
381
382      for (unsigned int i = 0; i < type->matrix_columns; i++) {
383	 offset += setup_uniform_values(loc + offset, column);
384      }
385
386      return offset;
387   }
388
389   switch (type->base_type) {
390   case GLSL_TYPE_FLOAT:
391   case GLSL_TYPE_UINT:
392   case GLSL_TYPE_INT:
393   case GLSL_TYPE_BOOL:
394      for (unsigned int i = 0; i < type->vector_elements; i++) {
395	 unsigned int param = c->prog_data.nr_params++;
396
397	 assert(param < ARRAY_SIZE(c->prog_data.param));
398
399	 switch (type->base_type) {
400	 case GLSL_TYPE_FLOAT:
401	    c->prog_data.param_convert[param] = PARAM_NO_CONVERT;
402	    break;
403	 case GLSL_TYPE_UINT:
404	    c->prog_data.param_convert[param] = PARAM_CONVERT_F2U;
405	    break;
406	 case GLSL_TYPE_INT:
407	    c->prog_data.param_convert[param] = PARAM_CONVERT_F2I;
408	    break;
409	 case GLSL_TYPE_BOOL:
410	    c->prog_data.param_convert[param] = PARAM_CONVERT_F2B;
411	    break;
412	 default:
413	    assert(!"not reached");
414	    c->prog_data.param_convert[param] = PARAM_NO_CONVERT;
415	    break;
416	 }
417	 this->param_index[param] = loc;
418	 this->param_offset[param] = i;
419      }
420      return 1;
421
422   case GLSL_TYPE_STRUCT:
423      for (unsigned int i = 0; i < type->length; i++) {
424	 offset += setup_uniform_values(loc + offset,
425					type->fields.structure[i].type);
426      }
427      return offset;
428
429   case GLSL_TYPE_ARRAY:
430      for (unsigned int i = 0; i < type->length; i++) {
431	 offset += setup_uniform_values(loc + offset, type->fields.array);
432      }
433      return offset;
434
435   case GLSL_TYPE_SAMPLER:
436      /* The sampler takes up a slot, but we don't use any values from it. */
437      return 1;
438
439   default:
440      assert(!"not reached");
441      return 0;
442   }
443}
444
445
446/* Our support for builtin uniforms is even scarier than non-builtin.
447 * It sits on top of the PROG_STATE_VAR parameters that are
448 * automatically updated from GL context state.
449 */
450void
451fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
452{
453   const ir_state_slot *const slots = ir->state_slots;
454   assert(ir->state_slots != NULL);
455
456   for (unsigned int i = 0; i < ir->num_state_slots; i++) {
457      /* This state reference has already been setup by ir_to_mesa, but we'll
458       * get the same index back here.
459       */
460      int index = _mesa_add_state_reference(this->fp->Base.Parameters,
461					    (gl_state_index *)slots[i].tokens);
462
463      /* Add each of the unique swizzles of the element as a parameter.
464       * This'll end up matching the expected layout of the
465       * array/matrix/structure we're trying to fill in.
466       */
467      int last_swiz = -1;
468      for (unsigned int j = 0; j < 4; j++) {
469	 int swiz = GET_SWZ(slots[i].swizzle, j);
470	 if (swiz == last_swiz)
471	    break;
472	 last_swiz = swiz;
473
474	 c->prog_data.param_convert[c->prog_data.nr_params] =
475	    PARAM_NO_CONVERT;
476	 this->param_index[c->prog_data.nr_params] = index;
477	 this->param_offset[c->prog_data.nr_params] = swiz;
478	 c->prog_data.nr_params++;
479      }
480   }
481}
482
483fs_reg *
484fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
485{
486   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
487   fs_reg wpos = *reg;
488   fs_reg neg_y = this->pixel_y;
489   neg_y.negate = true;
490   bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;
491
492   /* gl_FragCoord.x */
493   if (ir->pixel_center_integer) {
494      emit(BRW_OPCODE_MOV, wpos, this->pixel_x);
495   } else {
496      emit(BRW_OPCODE_ADD, wpos, this->pixel_x, fs_reg(0.5f));
497   }
498   wpos.reg_offset++;
499
500   /* gl_FragCoord.y */
501   if (!flip && ir->pixel_center_integer) {
502      emit(BRW_OPCODE_MOV, wpos, this->pixel_y);
503   } else {
504      fs_reg pixel_y = this->pixel_y;
505      float offset = (ir->pixel_center_integer ? 0.0 : 0.5);
506
507      if (flip) {
508	 pixel_y.negate = true;
509	 offset += c->key.drawable_height - 1.0;
510      }
511
512      emit(BRW_OPCODE_ADD, wpos, pixel_y, fs_reg(offset));
513   }
514   wpos.reg_offset++;
515
516   /* gl_FragCoord.z */
517   if (intel->gen >= 6) {
518      emit(BRW_OPCODE_MOV, wpos,
519	   fs_reg(brw_vec8_grf(c->source_depth_reg, 0)));
520   } else {
521      emit(FS_OPCODE_LINTERP, wpos, this->delta_x, this->delta_y,
522	   interp_reg(FRAG_ATTRIB_WPOS, 2));
523   }
524   wpos.reg_offset++;
525
526   /* gl_FragCoord.w: Already set up in emit_interpolation */
527   emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
528
529   return reg;
530}
531
532fs_reg *
533fs_visitor::emit_general_interpolation(ir_variable *ir)
534{
535   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
536   /* Interpolation is always in floating point regs. */
537   reg->type = BRW_REGISTER_TYPE_F;
538   fs_reg attr = *reg;
539
540   unsigned int array_elements;
541   const glsl_type *type;
542
543   if (ir->type->is_array()) {
544      array_elements = ir->type->length;
545      if (array_elements == 0) {
546	 fail("dereferenced array '%s' has length 0\n", ir->name);
547      }
548      type = ir->type->fields.array;
549   } else {
550      array_elements = 1;
551      type = ir->type;
552   }
553
554   int location = ir->location;
555   for (unsigned int i = 0; i < array_elements; i++) {
556      for (unsigned int j = 0; j < type->matrix_columns; j++) {
557	 if (urb_setup[location] == -1) {
558	    /* If there's no incoming setup data for this slot, don't
559	     * emit interpolation for it.
560	     */
561	    attr.reg_offset += type->vector_elements;
562	    location++;
563	    continue;
564	 }
565
566	 bool is_gl_Color =
567	    location == FRAG_ATTRIB_COL0 || location == FRAG_ATTRIB_COL1;
568
569	 if (c->key.flat_shade && is_gl_Color) {
570	    /* Constant interpolation (flat shading) case. The SF has
571	     * handed us defined values in only the constant offset
572	     * field of the setup reg.
573	     */
574	    for (unsigned int k = 0; k < type->vector_elements; k++) {
575	       struct brw_reg interp = interp_reg(location, k);
576	       interp = suboffset(interp, 3);
577	       emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
578	       attr.reg_offset++;
579	    }
580	 } else {
581	    /* Perspective interpolation case. */
582	    for (unsigned int k = 0; k < type->vector_elements; k++) {
583	       struct brw_reg interp = interp_reg(location, k);
584	       emit(FS_OPCODE_LINTERP, attr,
585		    this->delta_x, this->delta_y, fs_reg(interp));
586	       attr.reg_offset++;
587	    }
588
589	    if (intel->gen < 6 && !(is_gl_Color && c->key.linear_color)) {
590	       attr.reg_offset -= type->vector_elements;
591	       for (unsigned int k = 0; k < type->vector_elements; k++) {
592		  emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
593		  attr.reg_offset++;
594	       }
595	    }
596	 }
597	 location++;
598      }
599   }
600
601   return reg;
602}
603
604fs_reg *
605fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
606{
607   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
608
609   /* The frontfacing comes in as a bit in the thread payload. */
610   if (intel->gen >= 6) {
611      emit(BRW_OPCODE_ASR, *reg,
612	   fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
613	   fs_reg(15));
614      emit(BRW_OPCODE_NOT, *reg, *reg);
615      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
616   } else {
617      struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
618      /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
619       * us front face
620       */
621      fs_inst *inst = emit(BRW_OPCODE_CMP, *reg,
622			   fs_reg(r1_6ud),
623			   fs_reg(1u << 31));
624      inst->conditional_mod = BRW_CONDITIONAL_L;
625      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
626   }
627
628   return reg;
629}
630
631fs_inst *
632fs_visitor::emit_math(fs_opcodes opcode, fs_reg dst, fs_reg src)
633{
634   switch (opcode) {
635   case FS_OPCODE_RCP:
636   case FS_OPCODE_RSQ:
637   case FS_OPCODE_SQRT:
638   case FS_OPCODE_EXP2:
639   case FS_OPCODE_LOG2:
640   case FS_OPCODE_SIN:
641   case FS_OPCODE_COS:
642      break;
643   default:
644      assert(!"not reached: bad math opcode");
645      return NULL;
646   }
647
648   /* Can't do hstride == 0 args to gen6 math, so expand it out.  We
649    * might be able to do better by doing execsize = 1 math and then
650    * expanding that result out, but we would need to be careful with
651    * masking.
652    *
653    * The hardware ignores source modifiers (negate and abs) on math
654    * instructions, so we also move to a temp to set those up.
655    */
656   if (intel->gen >= 6 && (src.file == UNIFORM ||
657			   src.abs ||
658			   src.negate)) {
659      fs_reg expanded = fs_reg(this, glsl_type::float_type);
660      emit(BRW_OPCODE_MOV, expanded, src);
661      src = expanded;
662   }
663
664   fs_inst *inst = emit(opcode, dst, src);
665
666   if (intel->gen < 6) {
667      inst->base_mrf = 2;
668      inst->mlen = c->dispatch_width / 8;
669   }
670
671   return inst;
672}
673
674fs_inst *
675fs_visitor::emit_math(fs_opcodes opcode, fs_reg dst, fs_reg src0, fs_reg src1)
676{
677   int base_mrf = 2;
678   fs_inst *inst;
679
680   assert(opcode == FS_OPCODE_POW);
681
682   if (intel->gen >= 6) {
683      /* Can't do hstride == 0 args to gen6 math, so expand it out.
684       *
685       * The hardware ignores source modifiers (negate and abs) on math
686       * instructions, so we also move to a temp to set those up.
687       */
688      if (src0.file == UNIFORM || src0.abs || src0.negate) {
689	 fs_reg expanded = fs_reg(this, glsl_type::float_type);
690	 emit(BRW_OPCODE_MOV, expanded, src0);
691	 src0 = expanded;
692      }
693
694      if (src1.file == UNIFORM || src1.abs || src1.negate) {
695	 fs_reg expanded = fs_reg(this, glsl_type::float_type);
696	 emit(BRW_OPCODE_MOV, expanded, src1);
697	 src1 = expanded;
698      }
699
700      inst = emit(opcode, dst, src0, src1);
701   } else {
702      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1), src1);
703      inst = emit(opcode, dst, src0, reg_null_f);
704
705      inst->base_mrf = base_mrf;
706      inst->mlen = 2 * c->dispatch_width / 8;
707   }
708   return inst;
709}
710
711void
712fs_visitor::visit(ir_variable *ir)
713{
714   fs_reg *reg = NULL;
715
716   if (variable_storage(ir))
717      return;
718
719   if (strcmp(ir->name, "gl_FragColor") == 0) {
720      this->frag_color = ir;
721   } else if (strcmp(ir->name, "gl_FragData") == 0) {
722      this->frag_data = ir;
723   } else if (strcmp(ir->name, "gl_FragDepth") == 0) {
724      this->frag_depth = ir;
725   }
726
727   if (ir->mode == ir_var_in) {
728      if (!strcmp(ir->name, "gl_FragCoord")) {
729	 reg = emit_fragcoord_interpolation(ir);
730      } else if (!strcmp(ir->name, "gl_FrontFacing")) {
731	 reg = emit_frontfacing_interpolation(ir);
732      } else {
733	 reg = emit_general_interpolation(ir);
734      }
735      assert(reg);
736      hash_table_insert(this->variable_ht, reg, ir);
737      return;
738   }
739
740   if (ir->mode == ir_var_uniform) {
741      int param_index = c->prog_data.nr_params;
742
743      if (c->dispatch_width == 16) {
744	 if (!variable_storage(ir)) {
745	    fail("Failed to find uniform '%s' in 16-wide\n", ir->name);
746	 }
747	 return;
748      }
749
750      if (!strncmp(ir->name, "gl_", 3)) {
751	 setup_builtin_uniform_values(ir);
752      } else {
753	 setup_uniform_values(ir->location, ir->type);
754      }
755
756      reg = new(this->mem_ctx) fs_reg(UNIFORM, param_index);
757      reg->type = brw_type_for_base_type(ir->type);
758   }
759
760   if (!reg)
761      reg = new(this->mem_ctx) fs_reg(this, ir->type);
762
763   hash_table_insert(this->variable_ht, reg, ir);
764}
765
766void
767fs_visitor::visit(ir_dereference_variable *ir)
768{
769   fs_reg *reg = variable_storage(ir->var);
770   this->result = *reg;
771}
772
773void
774fs_visitor::visit(ir_dereference_record *ir)
775{
776   const glsl_type *struct_type = ir->record->type;
777
778   ir->record->accept(this);
779
780   unsigned int offset = 0;
781   for (unsigned int i = 0; i < struct_type->length; i++) {
782      if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
783	 break;
784      offset += type_size(struct_type->fields.structure[i].type);
785   }
786   this->result.reg_offset += offset;
787   this->result.type = brw_type_for_base_type(ir->type);
788}
789
790void
791fs_visitor::visit(ir_dereference_array *ir)
792{
793   ir_constant *index;
794   int element_size;
795
796   ir->array->accept(this);
797   index = ir->array_index->as_constant();
798
799   element_size = type_size(ir->type);
800   this->result.type = brw_type_for_base_type(ir->type);
801
802   if (index) {
803      assert(this->result.file == UNIFORM ||
804	     (this->result.file == GRF &&
805	      this->result.reg != 0));
806      this->result.reg_offset += index->value.i[0] * element_size;
807   } else {
808      assert(!"FINISHME: non-constant array element");
809   }
810}
811
812/* Instruction selection: Produce a MOV.sat instead of
813 * MIN(MAX(val, 0), 1) when possible.
814 */
815bool
816fs_visitor::try_emit_saturate(ir_expression *ir)
817{
818   ir_rvalue *sat_val = ir->as_rvalue_to_saturate();
819
820   if (!sat_val)
821      return false;
822
823   sat_val->accept(this);
824   fs_reg src = this->result;
825
826   this->result = fs_reg(this, ir->type);
827   fs_inst *inst = emit(BRW_OPCODE_MOV, this->result, src);
828   inst->saturate = true;
829
830   return true;
831}
832
833static uint32_t
834brw_conditional_for_comparison(unsigned int op)
835{
836   switch (op) {
837   case ir_binop_less:
838      return BRW_CONDITIONAL_L;
839   case ir_binop_greater:
840      return BRW_CONDITIONAL_G;
841   case ir_binop_lequal:
842      return BRW_CONDITIONAL_LE;
843   case ir_binop_gequal:
844      return BRW_CONDITIONAL_GE;
845   case ir_binop_equal:
846   case ir_binop_all_equal: /* same as equal for scalars */
847      return BRW_CONDITIONAL_Z;
848   case ir_binop_nequal:
849   case ir_binop_any_nequal: /* same as nequal for scalars */
850      return BRW_CONDITIONAL_NZ;
851   default:
852      assert(!"not reached: bad operation for comparison");
853      return BRW_CONDITIONAL_NZ;
854   }
855}
856
857void
858fs_visitor::visit(ir_expression *ir)
859{
860   unsigned int operand;
861   fs_reg op[2], temp;
862   fs_inst *inst;
863
864   assert(ir->get_num_operands() <= 2);
865
866   if (try_emit_saturate(ir))
867      return;
868
869   for (operand = 0; operand < ir->get_num_operands(); operand++) {
870      ir->operands[operand]->accept(this);
871      if (this->result.file == BAD_FILE) {
872	 ir_print_visitor v;
873	 fail("Failed to get tree for expression operand:\n");
874	 ir->operands[operand]->accept(&v);
875      }
876      op[operand] = this->result;
877
878      /* Matrix expression operands should have been broken down to vector
879       * operations already.
880       */
881      assert(!ir->operands[operand]->type->is_matrix());
882      /* And then those vector operands should have been broken down to scalar.
883       */
884      assert(!ir->operands[operand]->type->is_vector());
885   }
886
887   /* Storage for our result.  If our result goes into an assignment, it will
888    * just get copy-propagated out, so no worries.
889    */
890   this->result = fs_reg(this, ir->type);
891
892   switch (ir->operation) {
893   case ir_unop_logic_not:
894      /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
895       * ones complement of the whole register, not just bit 0.
896       */
897      emit(BRW_OPCODE_XOR, this->result, op[0], fs_reg(1));
898      break;
899   case ir_unop_neg:
900      op[0].negate = !op[0].negate;
901      this->result = op[0];
902      break;
903   case ir_unop_abs:
904      op[0].abs = true;
905      op[0].negate = false;
906      this->result = op[0];
907      break;
908   case ir_unop_sign:
909      temp = fs_reg(this, ir->type);
910
911      emit(BRW_OPCODE_MOV, this->result, fs_reg(0.0f));
912
913      inst = emit(BRW_OPCODE_CMP, reg_null_f, op[0], fs_reg(0.0f));
914      inst->conditional_mod = BRW_CONDITIONAL_G;
915      inst = emit(BRW_OPCODE_MOV, this->result, fs_reg(1.0f));
916      inst->predicated = true;
917
918      inst = emit(BRW_OPCODE_CMP, reg_null_f, op[0], fs_reg(0.0f));
919      inst->conditional_mod = BRW_CONDITIONAL_L;
920      inst = emit(BRW_OPCODE_MOV, this->result, fs_reg(-1.0f));
921      inst->predicated = true;
922
923      break;
924   case ir_unop_rcp:
925      emit_math(FS_OPCODE_RCP, this->result, op[0]);
926      break;
927
928   case ir_unop_exp2:
929      emit_math(FS_OPCODE_EXP2, this->result, op[0]);
930      break;
931   case ir_unop_log2:
932      emit_math(FS_OPCODE_LOG2, this->result, op[0]);
933      break;
934   case ir_unop_exp:
935   case ir_unop_log:
936      assert(!"not reached: should be handled by ir_explog_to_explog2");
937      break;
938   case ir_unop_sin:
939   case ir_unop_sin_reduced:
940      emit_math(FS_OPCODE_SIN, this->result, op[0]);
941      break;
942   case ir_unop_cos:
943   case ir_unop_cos_reduced:
944      emit_math(FS_OPCODE_COS, this->result, op[0]);
945      break;
946
947   case ir_unop_dFdx:
948      emit(FS_OPCODE_DDX, this->result, op[0]);
949      break;
950   case ir_unop_dFdy:
951      emit(FS_OPCODE_DDY, this->result, op[0]);
952      break;
953
954   case ir_binop_add:
955      emit(BRW_OPCODE_ADD, this->result, op[0], op[1]);
956      break;
957   case ir_binop_sub:
958      assert(!"not reached: should be handled by ir_sub_to_add_neg");
959      break;
960
961   case ir_binop_mul:
962      emit(BRW_OPCODE_MUL, this->result, op[0], op[1]);
963      break;
964   case ir_binop_div:
965      assert(!"not reached: should be handled by ir_div_to_mul_rcp");
966      break;
967   case ir_binop_mod:
968      assert(!"ir_binop_mod should have been converted to b * fract(a/b)");
969      break;
970
971   case ir_binop_less:
972   case ir_binop_greater:
973   case ir_binop_lequal:
974   case ir_binop_gequal:
975   case ir_binop_equal:
976   case ir_binop_all_equal:
977   case ir_binop_nequal:
978   case ir_binop_any_nequal:
979      temp = this->result;
980      /* original gen4 does implicit conversion before comparison. */
981      if (intel->gen < 5)
982	 temp.type = op[0].type;
983
984      inst = emit(BRW_OPCODE_CMP, temp, op[0], op[1]);
985      inst->conditional_mod = brw_conditional_for_comparison(ir->operation);
986      emit(BRW_OPCODE_AND, this->result, this->result, fs_reg(0x1));
987      break;
988
989   case ir_binop_logic_xor:
990      emit(BRW_OPCODE_XOR, this->result, op[0], op[1]);
991      break;
992
993   case ir_binop_logic_or:
994      emit(BRW_OPCODE_OR, this->result, op[0], op[1]);
995      break;
996
997   case ir_binop_logic_and:
998      emit(BRW_OPCODE_AND, this->result, op[0], op[1]);
999      break;
1000
1001   case ir_binop_dot:
1002   case ir_unop_any:
1003      assert(!"not reached: should be handled by brw_fs_channel_expressions");
1004      break;
1005
1006   case ir_unop_noise:
1007      assert(!"not reached: should be handled by lower_noise");
1008      break;
1009
1010   case ir_quadop_vector:
1011      assert(!"not reached: should be handled by lower_quadop_vector");
1012      break;
1013
1014   case ir_unop_sqrt:
1015      emit_math(FS_OPCODE_SQRT, this->result, op[0]);
1016      break;
1017
1018   case ir_unop_rsq:
1019      emit_math(FS_OPCODE_RSQ, this->result, op[0]);
1020      break;
1021
1022   case ir_unop_i2f:
1023   case ir_unop_b2f:
1024   case ir_unop_b2i:
1025   case ir_unop_f2i:
1026      emit(BRW_OPCODE_MOV, this->result, op[0]);
1027      break;
1028   case ir_unop_f2b:
1029   case ir_unop_i2b:
1030      temp = this->result;
1031      /* original gen4 does implicit conversion before comparison. */
1032      if (intel->gen < 5)
1033	 temp.type = op[0].type;
1034
1035      inst = emit(BRW_OPCODE_CMP, temp, op[0], fs_reg(0.0f));
1036      inst->conditional_mod = BRW_CONDITIONAL_NZ;
1037      inst = emit(BRW_OPCODE_AND, this->result, this->result, fs_reg(1));
1038      break;
1039
1040   case ir_unop_trunc:
1041      emit(BRW_OPCODE_RNDZ, this->result, op[0]);
1042      break;
1043   case ir_unop_ceil:
1044      op[0].negate = !op[0].negate;
1045      inst = emit(BRW_OPCODE_RNDD, this->result, op[0]);
1046      this->result.negate = true;
1047      break;
1048   case ir_unop_floor:
1049      inst = emit(BRW_OPCODE_RNDD, this->result, op[0]);
1050      break;
1051   case ir_unop_fract:
1052      inst = emit(BRW_OPCODE_FRC, this->result, op[0]);
1053      break;
1054   case ir_unop_round_even:
1055      emit(BRW_OPCODE_RNDE, this->result, op[0]);
1056      break;
1057
1058   case ir_binop_min:
1059      inst = emit(BRW_OPCODE_CMP, this->result, op[0], op[1]);
1060      inst->conditional_mod = BRW_CONDITIONAL_L;
1061
1062      inst = emit(BRW_OPCODE_SEL, this->result, op[0], op[1]);
1063      inst->predicated = true;
1064      break;
1065   case ir_binop_max:
1066      inst = emit(BRW_OPCODE_CMP, this->result, op[0], op[1]);
1067      inst->conditional_mod = BRW_CONDITIONAL_G;
1068
1069      inst = emit(BRW_OPCODE_SEL, this->result, op[0], op[1]);
1070      inst->predicated = true;
1071      break;
1072
1073   case ir_binop_pow:
1074      emit_math(FS_OPCODE_POW, this->result, op[0], op[1]);
1075      break;
1076
1077   case ir_unop_bit_not:
1078      inst = emit(BRW_OPCODE_NOT, this->result, op[0]);
1079      break;
1080   case ir_binop_bit_and:
1081      inst = emit(BRW_OPCODE_AND, this->result, op[0], op[1]);
1082      break;
1083   case ir_binop_bit_xor:
1084      inst = emit(BRW_OPCODE_XOR, this->result, op[0], op[1]);
1085      break;
1086   case ir_binop_bit_or:
1087      inst = emit(BRW_OPCODE_OR, this->result, op[0], op[1]);
1088      break;
1089
1090   case ir_unop_u2f:
1091   case ir_binop_lshift:
1092   case ir_binop_rshift:
1093      assert(!"GLSL 1.30 features unsupported");
1094      break;
1095   }
1096}
1097
1098void
1099fs_visitor::emit_assignment_writes(fs_reg &l, fs_reg &r,
1100				   const glsl_type *type, bool predicated)
1101{
1102   switch (type->base_type) {
1103   case GLSL_TYPE_FLOAT:
1104   case GLSL_TYPE_UINT:
1105   case GLSL_TYPE_INT:
1106   case GLSL_TYPE_BOOL:
1107      for (unsigned int i = 0; i < type->components(); i++) {
1108	 l.type = brw_type_for_base_type(type);
1109	 r.type = brw_type_for_base_type(type);
1110
1111	 fs_inst *inst = emit(BRW_OPCODE_MOV, l, r);
1112	 inst->predicated = predicated;
1113
1114	 l.reg_offset++;
1115	 r.reg_offset++;
1116      }
1117      break;
1118   case GLSL_TYPE_ARRAY:
1119      for (unsigned int i = 0; i < type->length; i++) {
1120	 emit_assignment_writes(l, r, type->fields.array, predicated);
1121      }
1122      break;
1123
1124   case GLSL_TYPE_STRUCT:
1125      for (unsigned int i = 0; i < type->length; i++) {
1126	 emit_assignment_writes(l, r, type->fields.structure[i].type,
1127				predicated);
1128      }
1129      break;
1130
1131   case GLSL_TYPE_SAMPLER:
1132      break;
1133
1134   default:
1135      assert(!"not reached");
1136      break;
1137   }
1138}
1139
1140void
1141fs_visitor::visit(ir_assignment *ir)
1142{
1143   struct fs_reg l, r;
1144   fs_inst *inst;
1145
1146   /* FINISHME: arrays on the lhs */
1147   ir->lhs->accept(this);
1148   l = this->result;
1149
1150   ir->rhs->accept(this);
1151   r = this->result;
1152
1153   assert(l.file != BAD_FILE);
1154   assert(r.file != BAD_FILE);
1155
1156   if (ir->condition) {
1157      emit_bool_to_cond_code(ir->condition);
1158   }
1159
1160   if (ir->lhs->type->is_scalar() ||
1161       ir->lhs->type->is_vector()) {
1162      for (int i = 0; i < ir->lhs->type->vector_elements; i++) {
1163	 if (ir->write_mask & (1 << i)) {
1164	    inst = emit(BRW_OPCODE_MOV, l, r);
1165	    if (ir->condition)
1166	       inst->predicated = true;
1167	    r.reg_offset++;
1168	 }
1169	 l.reg_offset++;
1170      }
1171   } else {
1172      emit_assignment_writes(l, r, ir->lhs->type, ir->condition != NULL);
1173   }
1174}
1175
1176fs_inst *
1177fs_visitor::emit_texture_gen4(ir_texture *ir, fs_reg dst, fs_reg coordinate)
1178{
1179   int mlen;
1180   int base_mrf = 1;
1181   bool simd16 = false;
1182   fs_reg orig_dst;
1183
1184   /* g0 header. */
1185   mlen = 1;
1186
1187   if (ir->shadow_comparitor) {
1188      for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
1189	 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i), coordinate);
1190	 coordinate.reg_offset++;
1191      }
1192      /* gen4's SIMD8 sampler always has the slots for u,v,r present. */
1193      mlen += 3;
1194
1195      if (ir->op == ir_tex) {
1196	 /* There's no plain shadow compare message, so we use shadow
1197	  * compare with a bias of 0.0.
1198	  */
1199	 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), fs_reg(0.0f));
1200	 mlen++;
1201      } else if (ir->op == ir_txb) {
1202	 ir->lod_info.bias->accept(this);
1203	 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
1204	 mlen++;
1205      } else {
1206	 assert(ir->op == ir_txl);
1207	 ir->lod_info.lod->accept(this);
1208	 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
1209	 mlen++;
1210      }
1211
1212      ir->shadow_comparitor->accept(this);
1213      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
1214      mlen++;
1215   } else if (ir->op == ir_tex) {
1216      for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
1217	 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i), coordinate);
1218	 coordinate.reg_offset++;
1219      }
1220      /* gen4's SIMD8 sampler always has the slots for u,v,r present. */
1221      mlen += 3;
1222   } else if (ir->op == ir_txd) {
1223      assert(!"TXD isn't supported on gen4 yet.");
1224   } else {
1225      /* Oh joy.  gen4 doesn't have SIMD8 non-shadow-compare bias/lod
1226       * instructions.  We'll need to do SIMD16 here.
1227       */
1228      assert(ir->op == ir_txb || ir->op == ir_txl);
1229
1230      for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
1231	 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i * 2), coordinate);
1232	 coordinate.reg_offset++;
1233      }
1234
1235      /* lod/bias appears after u/v/r. */
1236      mlen += 6;
1237
1238      if (ir->op == ir_txb) {
1239	 ir->lod_info.bias->accept(this);
1240	 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
1241	 mlen++;
1242      } else {
1243	 ir->lod_info.lod->accept(this);
1244	 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
1245	 mlen++;
1246      }
1247
1248      /* The unused upper half. */
1249      mlen++;
1250
1251      /* Now, since we're doing simd16, the return is 2 interleaved
1252       * vec4s where the odd-indexed ones are junk. We'll need to move
1253       * this weirdness around to the expected layout.
1254       */
1255      simd16 = true;
1256      orig_dst = dst;
1257      dst = fs_reg(this, glsl_type::get_array_instance(glsl_type::vec4_type,
1258						       2));
1259      dst.type = BRW_REGISTER_TYPE_F;
1260   }
1261
1262   fs_inst *inst = NULL;
1263   switch (ir->op) {
1264   case ir_tex:
1265      inst = emit(FS_OPCODE_TEX, dst);
1266      break;
1267   case ir_txb:
1268      inst = emit(FS_OPCODE_TXB, dst);
1269      break;
1270   case ir_txl:
1271      inst = emit(FS_OPCODE_TXL, dst);
1272      break;
1273   case ir_txd:
1274      inst = emit(FS_OPCODE_TXD, dst);
1275      break;
1276   case ir_txf:
1277      assert(!"GLSL 1.30 features unsupported");
1278      break;
1279   }
1280   inst->base_mrf = base_mrf;
1281   inst->mlen = mlen;
1282
1283   if (simd16) {
1284      for (int i = 0; i < 4; i++) {
1285	 emit(BRW_OPCODE_MOV, orig_dst, dst);
1286	 orig_dst.reg_offset++;
1287	 dst.reg_offset += 2;
1288      }
1289   }
1290
1291   return inst;
1292}
1293
1294/* gen5's sampler has slots for u, v, r, array index, then optional
1295 * parameters like shadow comparitor or LOD bias.  If optional
1296 * parameters aren't present, those base slots are optional and don't
1297 * need to be included in the message.
1298 *
1299 * We don't fill in the unnecessary slots regardless, which may look
1300 * surprising in the disassembly.
1301 */
1302fs_inst *
1303fs_visitor::emit_texture_gen5(ir_texture *ir, fs_reg dst, fs_reg coordinate)
1304{
1305   int mlen = 1; /* g0 header always present. */
1306   int base_mrf = 1;
1307   int reg_width = c->dispatch_width / 8;
1308
1309   for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
1310      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i * reg_width),
1311	   coordinate);
1312      coordinate.reg_offset++;
1313   }
1314   mlen += ir->coordinate->type->vector_elements * reg_width;
1315
1316   if (ir->shadow_comparitor) {
1317      mlen = MAX2(mlen, 1 + 4 * reg_width);
1318
1319      ir->shadow_comparitor->accept(this);
1320      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
1321      mlen += reg_width;
1322   }
1323
1324   fs_inst *inst = NULL;
1325   switch (ir->op) {
1326   case ir_tex:
1327      inst = emit(FS_OPCODE_TEX, dst);
1328      break;
1329   case ir_txb:
1330      ir->lod_info.bias->accept(this);
1331      mlen = MAX2(mlen, 1 + 4 * reg_width);
1332      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
1333      mlen += reg_width;
1334
1335      inst = emit(FS_OPCODE_TXB, dst);
1336
1337      break;
1338   case ir_txl:
1339      ir->lod_info.lod->accept(this);
1340      mlen = MAX2(mlen, 1 + 4 * reg_width);
1341      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
1342      mlen += reg_width;
1343
1344      inst = emit(FS_OPCODE_TXL, dst);
1345      break;
1346   case ir_txd:
1347   case ir_txf:
1348      assert(!"GLSL 1.30 features unsupported");
1349      break;
1350   }
1351   inst->base_mrf = base_mrf;
1352   inst->mlen = mlen;
1353
1354   if (mlen > 11) {
1355      fail("Message length >11 disallowed by hardware\n");
1356   }
1357
1358   return inst;
1359}
1360
1361fs_inst *
1362fs_visitor::emit_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg coordinate)
1363{
1364   int mlen = 1; /* g0 header always present. */
1365   int base_mrf = 1;
1366   int reg_width = c->dispatch_width / 8;
1367
1368   if (ir->shadow_comparitor) {
1369      ir->shadow_comparitor->accept(this);
1370      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
1371      mlen += reg_width;
1372   }
1373
1374   /* Set up the LOD info */
1375   switch (ir->op) {
1376   case ir_tex:
1377      break;
1378   case ir_txb:
1379      ir->lod_info.bias->accept(this);
1380      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
1381      mlen += reg_width;
1382      break;
1383   case ir_txl:
1384      ir->lod_info.lod->accept(this);
1385      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
1386      mlen += reg_width;
1387      break;
1388   case ir_txd:
1389   case ir_txf:
1390      assert(!"GLSL 1.30 features unsupported");
1391      break;
1392   }
1393
1394   /* Set up the coordinate */
1395   for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
1396      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen),
1397	   coordinate);
1398      coordinate.reg_offset++;
1399      mlen += reg_width;
1400   }
1401
1402   /* Generate the SEND */
1403   fs_inst *inst = NULL;
1404   switch (ir->op) {
1405   case ir_tex: inst = emit(FS_OPCODE_TEX, dst); break;
1406   case ir_txb: inst = emit(FS_OPCODE_TXB, dst); break;
1407   case ir_txl: inst = emit(FS_OPCODE_TXL, dst); break;
1408   case ir_txd: inst = emit(FS_OPCODE_TXD, dst); break;
1409   case ir_txf: assert(!"TXF unsupported.");
1410   }
1411   inst->base_mrf = base_mrf;
1412   inst->mlen = mlen;
1413
1414   if (mlen > 11) {
1415      fail("Message length >11 disallowed by hardware\n");
1416   }
1417
1418   return inst;
1419}
1420
1421void
1422fs_visitor::visit(ir_texture *ir)
1423{
1424   int sampler;
1425   fs_inst *inst = NULL;
1426
1427   ir->coordinate->accept(this);
1428   fs_reg coordinate = this->result;
1429
1430   if (ir->offset != NULL) {
1431      ir_constant *offset = ir->offset->as_constant();
1432      assert(offset != NULL);
1433
1434      signed char offsets[3];
1435      for (unsigned i = 0; i < ir->offset->type->vector_elements; i++)
1436	 offsets[i] = (signed char) offset->value.i[i];
1437
1438      /* Combine all three offsets into a single unsigned dword:
1439       *
1440       *    bits 11:8 - U Offset (X component)
1441       *    bits  7:4 - V Offset (Y component)
1442       *    bits  3:0 - R Offset (Z component)
1443       */
1444      unsigned offset_bits = 0;
1445      for (unsigned i = 0; i < ir->offset->type->vector_elements; i++) {
1446	 const unsigned shift = 4 * (2 - i);
1447	 offset_bits |= (offsets[i] << shift) & (0xF << shift);
1448      }
1449
1450      /* Explicitly set up the message header by copying g0 to msg reg m1. */
1451      emit(BRW_OPCODE_MOV, fs_reg(MRF, 1, BRW_REGISTER_TYPE_UD),
1452	   fs_reg(GRF, 0, BRW_REGISTER_TYPE_UD));
1453
1454      /* Then set the offset bits in DWord 2 of the message header. */
1455      emit(BRW_OPCODE_MOV,
1456	   fs_reg(retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, 1, 2),
1457			 BRW_REGISTER_TYPE_UD)),
1458	   fs_reg(brw_imm_uw(offset_bits)));
1459   }
1460
1461   /* Should be lowered by do_lower_texture_projection */
1462   assert(!ir->projector);
1463
1464   sampler = _mesa_get_sampler_uniform_value(ir->sampler,
1465					     ctx->Shader.CurrentFragmentProgram,
1466					     &brw->fragment_program->Base);
1467   sampler = c->fp->program.Base.SamplerUnits[sampler];
1468
1469   /* The 965 requires the EU to do the normalization of GL rectangle
1470    * texture coordinates.  We use the program parameter state
1471    * tracking to get the scaling factor.
1472    */
1473   if (ir->sampler->type->sampler_dimensionality == GLSL_SAMPLER_DIM_RECT) {
1474      struct gl_program_parameter_list *params = c->fp->program.Base.Parameters;
1475      int tokens[STATE_LENGTH] = {
1476	 STATE_INTERNAL,
1477	 STATE_TEXRECT_SCALE,
1478	 sampler,
1479	 0,
1480	 0
1481      };
1482
1483      if (c->dispatch_width == 16) {
1484	 fail("rectangle scale uniform setup not supported on 16-wide\n");
1485	 this->result = fs_reg(this, ir->type);
1486	 return;
1487      }
1488
1489      c->prog_data.param_convert[c->prog_data.nr_params] =
1490	 PARAM_NO_CONVERT;
1491      c->prog_data.param_convert[c->prog_data.nr_params + 1] =
1492	 PARAM_NO_CONVERT;
1493
1494      fs_reg scale_x = fs_reg(UNIFORM, c->prog_data.nr_params);
1495      fs_reg scale_y = fs_reg(UNIFORM, c->prog_data.nr_params + 1);
1496      GLuint index = _mesa_add_state_reference(params,
1497					       (gl_state_index *)tokens);
1498
1499      this->param_index[c->prog_data.nr_params] = index;
1500      this->param_offset[c->prog_data.nr_params] = 0;
1501      c->prog_data.nr_params++;
1502      this->param_index[c->prog_data.nr_params] = index;
1503      this->param_offset[c->prog_data.nr_params] = 1;
1504      c->prog_data.nr_params++;
1505
1506      fs_reg dst = fs_reg(this, ir->coordinate->type);
1507      fs_reg src = coordinate;
1508      coordinate = dst;
1509
1510      emit(BRW_OPCODE_MUL, dst, src, scale_x);
1511      dst.reg_offset++;
1512      src.reg_offset++;
1513      emit(BRW_OPCODE_MUL, dst, src, scale_y);
1514   }
1515
1516   /* Writemasking doesn't eliminate channels on SIMD8 texture
1517    * samples, so don't worry about them.
1518    */
1519   fs_reg dst = fs_reg(this, glsl_type::vec4_type);
1520
1521   if (intel->gen >= 7) {
1522      inst = emit_texture_gen7(ir, dst, coordinate);
1523   } else if (intel->gen >= 5) {
1524      inst = emit_texture_gen5(ir, dst, coordinate);
1525   } else {
1526      inst = emit_texture_gen4(ir, dst, coordinate);
1527   }
1528
1529   /* If there's an offset, we already set up m1.  To avoid the implied move,
1530    * use the null register.  Otherwise, we want an implied move from g0.
1531    */
1532   if (ir->offset != NULL)
1533      inst->src[0] = fs_reg(brw_null_reg());
1534   else
1535      inst->src[0] = fs_reg(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW));
1536
1537   inst->sampler = sampler;
1538
1539   this->result = dst;
1540
1541   if (ir->shadow_comparitor)
1542      inst->shadow_compare = true;
1543
1544   if (ir->type == glsl_type::float_type) {
1545      /* Ignore DEPTH_TEXTURE_MODE swizzling. */
1546      assert(ir->sampler->type->sampler_shadow);
1547   } else if (c->key.tex_swizzles[inst->sampler] != SWIZZLE_NOOP) {
1548      fs_reg swizzle_dst = fs_reg(this, glsl_type::vec4_type);
1549
1550      for (int i = 0; i < 4; i++) {
1551	 int swiz = GET_SWZ(c->key.tex_swizzles[inst->sampler], i);
1552	 fs_reg l = swizzle_dst;
1553	 l.reg_offset += i;
1554
1555	 if (swiz == SWIZZLE_ZERO) {
1556	    emit(BRW_OPCODE_MOV, l, fs_reg(0.0f));
1557	 } else if (swiz == SWIZZLE_ONE) {
1558	    emit(BRW_OPCODE_MOV, l, fs_reg(1.0f));
1559	 } else {
1560	    fs_reg r = dst;
1561	    r.reg_offset += GET_SWZ(c->key.tex_swizzles[inst->sampler], i);
1562	    emit(BRW_OPCODE_MOV, l, r);
1563	 }
1564      }
1565      this->result = swizzle_dst;
1566   }
1567}
1568
1569void
1570fs_visitor::visit(ir_swizzle *ir)
1571{
1572   ir->val->accept(this);
1573   fs_reg val = this->result;
1574
1575   if (ir->type->vector_elements == 1) {
1576      this->result.reg_offset += ir->mask.x;
1577      return;
1578   }
1579
1580   fs_reg result = fs_reg(this, ir->type);
1581   this->result = result;
1582
1583   for (unsigned int i = 0; i < ir->type->vector_elements; i++) {
1584      fs_reg channel = val;
1585      int swiz = 0;
1586
1587      switch (i) {
1588      case 0:
1589	 swiz = ir->mask.x;
1590	 break;
1591      case 1:
1592	 swiz = ir->mask.y;
1593	 break;
1594      case 2:
1595	 swiz = ir->mask.z;
1596	 break;
1597      case 3:
1598	 swiz = ir->mask.w;
1599	 break;
1600      }
1601
1602      channel.reg_offset += swiz;
1603      emit(BRW_OPCODE_MOV, result, channel);
1604      result.reg_offset++;
1605   }
1606}
1607
1608void
1609fs_visitor::visit(ir_discard *ir)
1610{
1611   fs_reg temp = fs_reg(this, glsl_type::uint_type);
1612
1613   assert(ir->condition == NULL); /* FINISHME */
1614
1615   emit(FS_OPCODE_DISCARD_NOT, temp, reg_null_d);
1616   emit(FS_OPCODE_DISCARD_AND, reg_null_d, temp);
1617   kill_emitted = true;
1618}
1619
1620void
1621fs_visitor::visit(ir_constant *ir)
1622{
1623   /* Set this->result to reg at the bottom of the function because some code
1624    * paths will cause this visitor to be applied to other fields.  This will
1625    * cause the value stored in this->result to be modified.
1626    *
1627    * Make reg constant so that it doesn't get accidentally modified along the
1628    * way.  Yes, I actually had this problem. :(
1629    */
1630   const fs_reg reg(this, ir->type);
1631   fs_reg dst_reg = reg;
1632
1633   if (ir->type->is_array()) {
1634      const unsigned size = type_size(ir->type->fields.array);
1635
1636      for (unsigned i = 0; i < ir->type->length; i++) {
1637	 ir->array_elements[i]->accept(this);
1638	 fs_reg src_reg = this->result;
1639
1640	 dst_reg.type = src_reg.type;
1641	 for (unsigned j = 0; j < size; j++) {
1642	    emit(BRW_OPCODE_MOV, dst_reg, src_reg);
1643	    src_reg.reg_offset++;
1644	    dst_reg.reg_offset++;
1645	 }
1646      }
1647   } else if (ir->type->is_record()) {
1648      foreach_list(node, &ir->components) {
1649	 ir_instruction *const field = (ir_instruction *) node;
1650	 const unsigned size = type_size(field->type);
1651
1652	 field->accept(this);
1653	 fs_reg src_reg = this->result;
1654
1655	 dst_reg.type = src_reg.type;
1656	 for (unsigned j = 0; j < size; j++) {
1657	    emit(BRW_OPCODE_MOV, dst_reg, src_reg);
1658	    src_reg.reg_offset++;
1659	    dst_reg.reg_offset++;
1660	 }
1661      }
1662   } else {
1663      const unsigned size = type_size(ir->type);
1664
1665      for (unsigned i = 0; i < size; i++) {
1666	 switch (ir->type->base_type) {
1667	 case GLSL_TYPE_FLOAT:
1668	    emit(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.f[i]));
1669	    break;
1670	 case GLSL_TYPE_UINT:
1671	    emit(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.u[i]));
1672	    break;
1673	 case GLSL_TYPE_INT:
1674	    emit(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.i[i]));
1675	    break;
1676	 case GLSL_TYPE_BOOL:
1677	    emit(BRW_OPCODE_MOV, dst_reg, fs_reg((int)ir->value.b[i]));
1678	    break;
1679	 default:
1680	    assert(!"Non-float/uint/int/bool constant");
1681	 }
1682	 dst_reg.reg_offset++;
1683      }
1684   }
1685
1686   this->result = reg;
1687}
1688
1689void
1690fs_visitor::emit_bool_to_cond_code(ir_rvalue *ir)
1691{
1692   ir_expression *expr = ir->as_expression();
1693
1694   if (expr) {
1695      fs_reg op[2];
1696      fs_inst *inst;
1697
1698      assert(expr->get_num_operands() <= 2);
1699      for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
1700	 assert(expr->operands[i]->type->is_scalar());
1701
1702	 expr->operands[i]->accept(this);
1703	 op[i] = this->result;
1704      }
1705
1706      switch (expr->operation) {
1707      case ir_unop_logic_not:
1708	 inst = emit(BRW_OPCODE_AND, reg_null_d, op[0], fs_reg(1));
1709	 inst->conditional_mod = BRW_CONDITIONAL_Z;
1710	 break;
1711
1712      case ir_binop_logic_xor:
1713	 inst = emit(BRW_OPCODE_XOR, reg_null_d, op[0], op[1]);
1714	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1715	 break;
1716
1717      case ir_binop_logic_or:
1718	 inst = emit(BRW_OPCODE_OR, reg_null_d, op[0], op[1]);
1719	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1720	 break;
1721
1722      case ir_binop_logic_and:
1723	 inst = emit(BRW_OPCODE_AND, reg_null_d, op[0], op[1]);
1724	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1725	 break;
1726
1727      case ir_unop_f2b:
1728	 if (intel->gen >= 6) {
1729	    inst = emit(BRW_OPCODE_CMP, reg_null_d, op[0], fs_reg(0.0f));
1730	 } else {
1731	    inst = emit(BRW_OPCODE_MOV, reg_null_f, op[0]);
1732	 }
1733	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1734	 break;
1735
1736      case ir_unop_i2b:
1737	 if (intel->gen >= 6) {
1738	    inst = emit(BRW_OPCODE_CMP, reg_null_d, op[0], fs_reg(0));
1739	 } else {
1740	    inst = emit(BRW_OPCODE_MOV, reg_null_d, op[0]);
1741	 }
1742	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1743	 break;
1744
1745      case ir_binop_greater:
1746      case ir_binop_gequal:
1747      case ir_binop_less:
1748      case ir_binop_lequal:
1749      case ir_binop_equal:
1750      case ir_binop_all_equal:
1751      case ir_binop_nequal:
1752      case ir_binop_any_nequal:
1753	 inst = emit(BRW_OPCODE_CMP, reg_null_cmp, op[0], op[1]);
1754	 inst->conditional_mod =
1755	    brw_conditional_for_comparison(expr->operation);
1756	 break;
1757
1758      default:
1759	 assert(!"not reached");
1760	 fail("bad cond code\n");
1761	 break;
1762      }
1763      return;
1764   }
1765
1766   ir->accept(this);
1767
1768   if (intel->gen >= 6) {
1769      fs_inst *inst = emit(BRW_OPCODE_AND, reg_null_d, this->result, fs_reg(1));
1770      inst->conditional_mod = BRW_CONDITIONAL_NZ;
1771   } else {
1772      fs_inst *inst = emit(BRW_OPCODE_MOV, reg_null_d, this->result);
1773      inst->conditional_mod = BRW_CONDITIONAL_NZ;
1774   }
1775}
1776
1777/**
1778 * Emit a gen6 IF statement with the comparison folded into the IF
1779 * instruction.
1780 */
1781void
1782fs_visitor::emit_if_gen6(ir_if *ir)
1783{
1784   ir_expression *expr = ir->condition->as_expression();
1785
1786   if (expr) {
1787      fs_reg op[2];
1788      fs_inst *inst;
1789      fs_reg temp;
1790
1791      assert(expr->get_num_operands() <= 2);
1792      for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
1793	 assert(expr->operands[i]->type->is_scalar());
1794
1795	 expr->operands[i]->accept(this);
1796	 op[i] = this->result;
1797      }
1798
1799      switch (expr->operation) {
1800      case ir_unop_logic_not:
1801	 inst = emit(BRW_OPCODE_IF, temp, op[0], fs_reg(0));
1802	 inst->conditional_mod = BRW_CONDITIONAL_Z;
1803	 return;
1804
1805      case ir_binop_logic_xor:
1806	 inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], op[1]);
1807	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1808	 return;
1809
1810      case ir_binop_logic_or:
1811	 temp = fs_reg(this, glsl_type::bool_type);
1812	 emit(BRW_OPCODE_OR, temp, op[0], op[1]);
1813	 inst = emit(BRW_OPCODE_IF, reg_null_d, temp, fs_reg(0));
1814	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1815	 return;
1816
1817      case ir_binop_logic_and:
1818	 temp = fs_reg(this, glsl_type::bool_type);
1819	 emit(BRW_OPCODE_AND, temp, op[0], op[1]);
1820	 inst = emit(BRW_OPCODE_IF, reg_null_d, temp, fs_reg(0));
1821	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1822	 return;
1823
1824      case ir_unop_f2b:
1825	 inst = emit(BRW_OPCODE_IF, reg_null_f, op[0], fs_reg(0));
1826	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1827	 return;
1828
1829      case ir_unop_i2b:
1830	 inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], fs_reg(0));
1831	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1832	 return;
1833
1834      case ir_binop_greater:
1835      case ir_binop_gequal:
1836      case ir_binop_less:
1837      case ir_binop_lequal:
1838      case ir_binop_equal:
1839      case ir_binop_all_equal:
1840      case ir_binop_nequal:
1841      case ir_binop_any_nequal:
1842	 inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], op[1]);
1843	 inst->conditional_mod =
1844	    brw_conditional_for_comparison(expr->operation);
1845	 return;
1846      default:
1847	 assert(!"not reached");
1848	 inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], fs_reg(0));
1849	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1850	 fail("bad condition\n");
1851	 return;
1852      }
1853      return;
1854   }
1855
1856   ir->condition->accept(this);
1857
1858   fs_inst *inst = emit(BRW_OPCODE_IF, reg_null_d, this->result, fs_reg(0));
1859   inst->conditional_mod = BRW_CONDITIONAL_NZ;
1860}
1861
1862void
1863fs_visitor::visit(ir_if *ir)
1864{
1865   fs_inst *inst;
1866
1867   if (c->dispatch_width == 16) {
1868      fail("Can't support (non-uniform) control flow on 16-wide\n");
1869   }
1870
1871   /* Don't point the annotation at the if statement, because then it plus
1872    * the then and else blocks get printed.
1873    */
1874   this->base_ir = ir->condition;
1875
1876   if (intel->gen >= 6) {
1877      emit_if_gen6(ir);
1878   } else {
1879      emit_bool_to_cond_code(ir->condition);
1880
1881      inst = emit(BRW_OPCODE_IF);
1882      inst->predicated = true;
1883   }
1884
1885   foreach_iter(exec_list_iterator, iter, ir->then_instructions) {
1886      ir_instruction *ir = (ir_instruction *)iter.get();
1887      this->base_ir = ir;
1888
1889      ir->accept(this);
1890   }
1891
1892   if (!ir->else_instructions.is_empty()) {
1893      emit(BRW_OPCODE_ELSE);
1894
1895      foreach_iter(exec_list_iterator, iter, ir->else_instructions) {
1896	 ir_instruction *ir = (ir_instruction *)iter.get();
1897	 this->base_ir = ir;
1898
1899	 ir->accept(this);
1900      }
1901   }
1902
1903   emit(BRW_OPCODE_ENDIF);
1904}
1905
1906void
1907fs_visitor::visit(ir_loop *ir)
1908{
1909   fs_reg counter = reg_undef;
1910
1911   if (c->dispatch_width == 16) {
1912      fail("Can't support (non-uniform) control flow on 16-wide\n");
1913   }
1914
1915   if (ir->counter) {
1916      this->base_ir = ir->counter;
1917      ir->counter->accept(this);
1918      counter = *(variable_storage(ir->counter));
1919
1920      if (ir->from) {
1921	 this->base_ir = ir->from;
1922	 ir->from->accept(this);
1923
1924	 emit(BRW_OPCODE_MOV, counter, this->result);
1925      }
1926   }
1927
1928   emit(BRW_OPCODE_DO);
1929
1930   if (ir->to) {
1931      this->base_ir = ir->to;
1932      ir->to->accept(this);
1933
1934      fs_inst *inst = emit(BRW_OPCODE_CMP, reg_null_cmp, counter, this->result);
1935      inst->conditional_mod = brw_conditional_for_comparison(ir->cmp);
1936
1937      inst = emit(BRW_OPCODE_BREAK);
1938      inst->predicated = true;
1939   }
1940
1941   foreach_iter(exec_list_iterator, iter, ir->body_instructions) {
1942      ir_instruction *ir = (ir_instruction *)iter.get();
1943
1944      this->base_ir = ir;
1945      ir->accept(this);
1946   }
1947
1948   if (ir->increment) {
1949      this->base_ir = ir->increment;
1950      ir->increment->accept(this);
1951      emit(BRW_OPCODE_ADD, counter, counter, this->result);
1952   }
1953
1954   emit(BRW_OPCODE_WHILE);
1955}
1956
1957void
1958fs_visitor::visit(ir_loop_jump *ir)
1959{
1960   switch (ir->mode) {
1961   case ir_loop_jump::jump_break:
1962      emit(BRW_OPCODE_BREAK);
1963      break;
1964   case ir_loop_jump::jump_continue:
1965      emit(BRW_OPCODE_CONTINUE);
1966      break;
1967   }
1968}
1969
1970void
1971fs_visitor::visit(ir_call *ir)
1972{
1973   assert(!"FINISHME");
1974}
1975
1976void
1977fs_visitor::visit(ir_return *ir)
1978{
1979   assert(!"FINISHME");
1980}
1981
1982void
1983fs_visitor::visit(ir_function *ir)
1984{
1985   /* Ignore function bodies other than main() -- we shouldn't see calls to
1986    * them since they should all be inlined before we get to ir_to_mesa.
1987    */
1988   if (strcmp(ir->name, "main") == 0) {
1989      const ir_function_signature *sig;
1990      exec_list empty;
1991
1992      sig = ir->matching_signature(&empty);
1993
1994      assert(sig);
1995
1996      foreach_iter(exec_list_iterator, iter, sig->body) {
1997	 ir_instruction *ir = (ir_instruction *)iter.get();
1998	 this->base_ir = ir;
1999
2000	 ir->accept(this);
2001      }
2002   }
2003}
2004
2005void
2006fs_visitor::visit(ir_function_signature *ir)
2007{
2008   assert(!"not reached");
2009   (void)ir;
2010}
2011
2012fs_inst *
2013fs_visitor::emit(fs_inst inst)
2014{
2015   fs_inst *list_inst = new(mem_ctx) fs_inst;
2016   *list_inst = inst;
2017
2018   if (force_uncompressed_stack > 0)
2019      list_inst->force_uncompressed = true;
2020   else if (force_sechalf_stack > 0)
2021      list_inst->force_sechalf = true;
2022
2023   list_inst->annotation = this->current_annotation;
2024   list_inst->ir = this->base_ir;
2025
2026   this->instructions.push_tail(list_inst);
2027
2028   return list_inst;
2029}
2030
2031/** Emits a dummy fragment shader consisting of magenta for bringup purposes. */
2032void
2033fs_visitor::emit_dummy_fs()
2034{
2035   /* Everyone's favorite color. */
2036   emit(BRW_OPCODE_MOV, fs_reg(MRF, 2), fs_reg(1.0f));
2037   emit(BRW_OPCODE_MOV, fs_reg(MRF, 3), fs_reg(0.0f));
2038   emit(BRW_OPCODE_MOV, fs_reg(MRF, 4), fs_reg(1.0f));
2039   emit(BRW_OPCODE_MOV, fs_reg(MRF, 5), fs_reg(0.0f));
2040
2041   fs_inst *write;
2042   write = emit(FS_OPCODE_FB_WRITE, fs_reg(0), fs_reg(0));
2043   write->base_mrf = 0;
2044}
2045
2046/* The register location here is relative to the start of the URB
2047 * data.  It will get adjusted to be a real location before
2048 * generate_code() time.
2049 */
2050struct brw_reg
2051fs_visitor::interp_reg(int location, int channel)
2052{
2053   int regnr = urb_setup[location] * 2 + channel / 2;
2054   int stride = (channel & 1) * 4;
2055
2056   assert(urb_setup[location] != -1);
2057
2058   return brw_vec1_grf(regnr, stride);
2059}
2060
2061/** Emits the interpolation for the varying inputs. */
2062void
2063fs_visitor::emit_interpolation_setup_gen4()
2064{
2065   this->current_annotation = "compute pixel centers";
2066   this->pixel_x = fs_reg(this, glsl_type::uint_type);
2067   this->pixel_y = fs_reg(this, glsl_type::uint_type);
2068   this->pixel_x.type = BRW_REGISTER_TYPE_UW;
2069   this->pixel_y.type = BRW_REGISTER_TYPE_UW;
2070
2071   emit(FS_OPCODE_PIXEL_X, this->pixel_x);
2072   emit(FS_OPCODE_PIXEL_Y, this->pixel_y);
2073
2074   this->current_annotation = "compute pixel deltas from v0";
2075   if (brw->has_pln) {
2076      this->delta_x = fs_reg(this, glsl_type::vec2_type);
2077      this->delta_y = this->delta_x;
2078      this->delta_y.reg_offset++;
2079   } else {
2080      this->delta_x = fs_reg(this, glsl_type::float_type);
2081      this->delta_y = fs_reg(this, glsl_type::float_type);
2082   }
2083   emit(BRW_OPCODE_ADD, this->delta_x,
2084	this->pixel_x, fs_reg(negate(brw_vec1_grf(1, 0))));
2085   emit(BRW_OPCODE_ADD, this->delta_y,
2086	this->pixel_y, fs_reg(negate(brw_vec1_grf(1, 1))));
2087
2088   this->current_annotation = "compute pos.w and 1/pos.w";
2089   /* Compute wpos.w.  It's always in our setup, since it's needed to
2090    * interpolate the other attributes.
2091    */
2092   this->wpos_w = fs_reg(this, glsl_type::float_type);
2093   emit(FS_OPCODE_LINTERP, wpos_w, this->delta_x, this->delta_y,
2094	interp_reg(FRAG_ATTRIB_WPOS, 3));
2095   /* Compute the pixel 1/W value from wpos.w. */
2096   this->pixel_w = fs_reg(this, glsl_type::float_type);
2097   emit_math(FS_OPCODE_RCP, this->pixel_w, wpos_w);
2098   this->current_annotation = NULL;
2099}
2100
2101/** Emits the interpolation for the varying inputs. */
2102void
2103fs_visitor::emit_interpolation_setup_gen6()
2104{
2105   struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
2106
2107   /* If the pixel centers end up used, the setup is the same as for gen4. */
2108   this->current_annotation = "compute pixel centers";
2109   fs_reg int_pixel_x = fs_reg(this, glsl_type::uint_type);
2110   fs_reg int_pixel_y = fs_reg(this, glsl_type::uint_type);
2111   int_pixel_x.type = BRW_REGISTER_TYPE_UW;
2112   int_pixel_y.type = BRW_REGISTER_TYPE_UW;
2113   emit(BRW_OPCODE_ADD,
2114	int_pixel_x,
2115	fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
2116	fs_reg(brw_imm_v(0x10101010)));
2117   emit(BRW_OPCODE_ADD,
2118	int_pixel_y,
2119	fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
2120	fs_reg(brw_imm_v(0x11001100)));
2121
2122   /* As of gen6, we can no longer mix float and int sources.  We have
2123    * to turn the integer pixel centers into floats for their actual
2124    * use.
2125    */
2126   this->pixel_x = fs_reg(this, glsl_type::float_type);
2127   this->pixel_y = fs_reg(this, glsl_type::float_type);
2128   emit(BRW_OPCODE_MOV, this->pixel_x, int_pixel_x);
2129   emit(BRW_OPCODE_MOV, this->pixel_y, int_pixel_y);
2130
2131   this->current_annotation = "compute pos.w";
2132   this->pixel_w = fs_reg(brw_vec8_grf(c->source_w_reg, 0));
2133   this->wpos_w = fs_reg(this, glsl_type::float_type);
2134   emit_math(FS_OPCODE_RCP, this->wpos_w, this->pixel_w);
2135
2136   this->delta_x = fs_reg(brw_vec8_grf(2, 0));
2137   this->delta_y = fs_reg(brw_vec8_grf(3, 0));
2138
2139   this->current_annotation = NULL;
2140}
2141
2142void
2143fs_visitor::emit_color_write(int index, int first_color_mrf, fs_reg color)
2144{
2145   int reg_width = c->dispatch_width / 8;
2146
2147   if (c->dispatch_width == 8 || intel->gen == 6) {
2148      /* SIMD8 write looks like:
2149       * m + 0: r0
2150       * m + 1: r1
2151       * m + 2: g0
2152       * m + 3: g1
2153       *
2154       * gen6 SIMD16 DP write looks like:
2155       * m + 0: r0
2156       * m + 1: r1
2157       * m + 2: g0
2158       * m + 3: g1
2159       * m + 4: b0
2160       * m + 5: b1
2161       * m + 6: a0
2162       * m + 7: a1
2163       */
2164      emit(BRW_OPCODE_MOV, fs_reg(MRF, first_color_mrf + index * reg_width),
2165	   color);
2166   } else {
2167      /* pre-gen6 SIMD16 single source DP write looks like:
2168       * m + 0: r0
2169       * m + 1: g0
2170       * m + 2: b0
2171       * m + 3: a0
2172       * m + 4: r1
2173       * m + 5: g1
2174       * m + 6: b1
2175       * m + 7: a1
2176       */
2177      if (brw->has_compr4) {
2178	 /* By setting the high bit of the MRF register number, we
2179	  * indicate that we want COMPR4 mode - instead of doing the
2180	  * usual destination + 1 for the second half we get
2181	  * destination + 4.
2182	  */
2183	 emit(BRW_OPCODE_MOV,
2184	      fs_reg(MRF, BRW_MRF_COMPR4 + first_color_mrf + index), color);
2185      } else {
2186	 push_force_uncompressed();
2187	 emit(BRW_OPCODE_MOV, fs_reg(MRF, first_color_mrf + index), color);
2188	 pop_force_uncompressed();
2189
2190	 push_force_sechalf();
2191	 color.sechalf = true;
2192	 emit(BRW_OPCODE_MOV, fs_reg(MRF, first_color_mrf + index + 4), color);
2193	 pop_force_sechalf();
2194	 color.sechalf = false;
2195      }
2196   }
2197}
2198
2199void
2200fs_visitor::emit_fb_writes()
2201{
2202   this->current_annotation = "FB write header";
2203   GLboolean header_present = GL_TRUE;
2204   int nr = 0;
2205   int reg_width = c->dispatch_width / 8;
2206
2207   if (intel->gen >= 6 &&
2208       !this->kill_emitted &&
2209       c->key.nr_color_regions == 1) {
2210      header_present = false;
2211   }
2212
2213   if (header_present) {
2214      /* m0, m1 header */
2215      nr += 2;
2216   }
2217
2218   if (c->aa_dest_stencil_reg) {
2219      push_force_uncompressed();
2220      emit(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
2221	   fs_reg(brw_vec8_grf(c->aa_dest_stencil_reg, 0)));
2222      pop_force_uncompressed();
2223   }
2224
2225   /* Reserve space for color. It'll be filled in per MRT below. */
2226   int color_mrf = nr;
2227   nr += 4 * reg_width;
2228
2229   if (c->source_depth_to_render_target) {
2230      if (intel->gen == 6 && c->dispatch_width == 16) {
2231	 /* For outputting oDepth on gen6, SIMD8 writes have to be
2232	  * used.  This would require 8-wide moves of each half to
2233	  * message regs, kind of like pre-gen5 SIMD16 FB writes.
2234	  * Just bail on doing so for now.
2235	  */
2236	 fail("Missing support for simd16 depth writes on gen6\n");
2237      }
2238
2239      if (c->computes_depth) {
2240	 /* Hand over gl_FragDepth. */
2241	 assert(this->frag_depth);
2242	 fs_reg depth = *(variable_storage(this->frag_depth));
2243
2244	 emit(BRW_OPCODE_MOV, fs_reg(MRF, nr), depth);
2245      } else {
2246	 /* Pass through the payload depth. */
2247	 emit(BRW_OPCODE_MOV, fs_reg(MRF, nr),
2248	      fs_reg(brw_vec8_grf(c->source_depth_reg, 0)));
2249      }
2250      nr += reg_width;
2251   }
2252
2253   if (c->dest_depth_reg) {
2254      emit(BRW_OPCODE_MOV, fs_reg(MRF, nr),
2255	   fs_reg(brw_vec8_grf(c->dest_depth_reg, 0)));
2256      nr += reg_width;
2257   }
2258
2259   fs_reg color = reg_undef;
2260   if (this->frag_color)
2261      color = *(variable_storage(this->frag_color));
2262   else if (this->frag_data) {
2263      color = *(variable_storage(this->frag_data));
2264      color.type = BRW_REGISTER_TYPE_F;
2265   }
2266
2267   for (int target = 0; target < c->key.nr_color_regions; target++) {
2268      this->current_annotation = ralloc_asprintf(this->mem_ctx,
2269						 "FB write target %d",
2270						 target);
2271      if (this->frag_color || this->frag_data) {
2272	 for (int i = 0; i < 4; i++) {
2273	    emit_color_write(i, color_mrf, color);
2274	    color.reg_offset++;
2275	 }
2276      }
2277
2278      if (this->frag_color)
2279	 color.reg_offset -= 4;
2280
2281      fs_inst *inst = emit(FS_OPCODE_FB_WRITE);
2282      inst->target = target;
2283      inst->base_mrf = 0;
2284      inst->mlen = nr;
2285      if (target == c->key.nr_color_regions - 1)
2286	 inst->eot = true;
2287      inst->header_present = header_present;
2288   }
2289
2290   if (c->key.nr_color_regions == 0) {
2291      if (c->key.alpha_test && (this->frag_color || this->frag_data)) {
2292	 /* If the alpha test is enabled but there's no color buffer,
2293	  * we still need to send alpha out the pipeline to our null
2294	  * renderbuffer.
2295	  */
2296	 color.reg_offset += 3;
2297	 emit_color_write(3, color_mrf, color);
2298      }
2299
2300      fs_inst *inst = emit(FS_OPCODE_FB_WRITE);
2301      inst->base_mrf = 0;
2302      inst->mlen = nr;
2303      inst->eot = true;
2304      inst->header_present = header_present;
2305   }
2306
2307   this->current_annotation = NULL;
2308}
2309
2310void
2311fs_visitor::generate_fb_write(fs_inst *inst)
2312{
2313   GLboolean eot = inst->eot;
2314   struct brw_reg implied_header;
2315
2316   /* Header is 2 regs, g0 and g1 are the contents. g0 will be implied
2317    * move, here's g1.
2318    */
2319   brw_push_insn_state(p);
2320   brw_set_mask_control(p, BRW_MASK_DISABLE);
2321   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2322
2323   if (inst->header_present) {
2324      if (intel->gen >= 6) {
2325	 brw_MOV(p,
2326		 brw_message_reg(inst->base_mrf),
2327		 brw_vec8_grf(0, 0));
2328
2329	 if (inst->target > 0) {
2330	    /* Set the render target index for choosing BLEND_STATE. */
2331	    brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, 0, 2),
2332			      BRW_REGISTER_TYPE_UD),
2333		    brw_imm_ud(inst->target));
2334	 }
2335
2336	 /* Clear viewport index, render target array index. */
2337	 brw_AND(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, 0, 0),
2338			   BRW_REGISTER_TYPE_UD),
2339		 retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD),
2340		 brw_imm_ud(0xf7ff));
2341
2342	 implied_header = brw_null_reg();
2343      } else {
2344	 implied_header = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
2345      }
2346
2347      brw_MOV(p,
2348	      brw_message_reg(inst->base_mrf + 1),
2349	      brw_vec8_grf(1, 0));
2350   } else {
2351      implied_header = brw_null_reg();
2352   }
2353
2354   brw_pop_insn_state(p);
2355
2356   brw_fb_WRITE(p,
2357		c->dispatch_width,
2358		inst->base_mrf,
2359		implied_header,
2360		inst->target,
2361		inst->mlen,
2362		0,
2363		eot,
2364		inst->header_present);
2365}
2366
2367/* Computes the integer pixel x,y values from the origin.
2368 *
2369 * This is the basis of gl_FragCoord computation, but is also used
2370 * pre-gen6 for computing the deltas from v0 for computing
2371 * interpolation.
2372 */
2373void
2374fs_visitor::generate_pixel_xy(struct brw_reg dst, bool is_x)
2375{
2376   struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
2377   struct brw_reg src;
2378   struct brw_reg deltas;
2379
2380   if (is_x) {
2381      src = stride(suboffset(g1_uw, 4), 2, 4, 0);
2382      deltas = brw_imm_v(0x10101010);
2383   } else {
2384      src = stride(suboffset(g1_uw, 5), 2, 4, 0);
2385      deltas = brw_imm_v(0x11001100);
2386   }
2387
2388   if (c->dispatch_width == 16) {
2389      dst = vec16(dst);
2390   }
2391
2392   /* We do this 8 or 16-wide, but since the destination is UW we
2393    * don't do compression in the 16-wide case.
2394    */
2395   brw_push_insn_state(p);
2396   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2397   brw_ADD(p, dst, src, deltas);
2398   brw_pop_insn_state(p);
2399}
2400
2401void
2402fs_visitor::generate_linterp(fs_inst *inst,
2403			     struct brw_reg dst, struct brw_reg *src)
2404{
2405   struct brw_reg delta_x = src[0];
2406   struct brw_reg delta_y = src[1];
2407   struct brw_reg interp = src[2];
2408
2409   if (brw->has_pln &&
2410       delta_y.nr == delta_x.nr + 1 &&
2411       (intel->gen >= 6 || (delta_x.nr & 1) == 0)) {
2412      brw_PLN(p, dst, interp, delta_x);
2413   } else {
2414      brw_LINE(p, brw_null_reg(), interp, delta_x);
2415      brw_MAC(p, dst, suboffset(interp, 1), delta_y);
2416   }
2417}
2418
2419void
2420fs_visitor::generate_math(fs_inst *inst,
2421			  struct brw_reg dst, struct brw_reg *src)
2422{
2423   int op;
2424
2425   switch (inst->opcode) {
2426   case FS_OPCODE_RCP:
2427      op = BRW_MATH_FUNCTION_INV;
2428      break;
2429   case FS_OPCODE_RSQ:
2430      op = BRW_MATH_FUNCTION_RSQ;
2431      break;
2432   case FS_OPCODE_SQRT:
2433      op = BRW_MATH_FUNCTION_SQRT;
2434      break;
2435   case FS_OPCODE_EXP2:
2436      op = BRW_MATH_FUNCTION_EXP;
2437      break;
2438   case FS_OPCODE_LOG2:
2439      op = BRW_MATH_FUNCTION_LOG;
2440      break;
2441   case FS_OPCODE_POW:
2442      op = BRW_MATH_FUNCTION_POW;
2443      break;
2444   case FS_OPCODE_SIN:
2445      op = BRW_MATH_FUNCTION_SIN;
2446      break;
2447   case FS_OPCODE_COS:
2448      op = BRW_MATH_FUNCTION_COS;
2449      break;
2450   default:
2451      assert(!"not reached: unknown math function");
2452      op = 0;
2453      break;
2454   }
2455
2456   if (intel->gen >= 6) {
2457      assert(inst->mlen == 0);
2458
2459      if (inst->opcode == FS_OPCODE_POW) {
2460	 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2461	 brw_math2(p, dst, op, src[0], src[1]);
2462
2463	 if (c->dispatch_width == 16) {
2464	    brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
2465	    brw_math2(p, sechalf(dst), op, sechalf(src[0]), sechalf(src[1]));
2466	    brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
2467	 }
2468      } else {
2469	 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2470	 brw_math(p, dst,
2471		  op,
2472		  inst->saturate ? BRW_MATH_SATURATE_SATURATE :
2473		  BRW_MATH_SATURATE_NONE,
2474		  0, src[0],
2475		  BRW_MATH_DATA_VECTOR,
2476		  BRW_MATH_PRECISION_FULL);
2477
2478	 if (c->dispatch_width == 16) {
2479	    brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
2480	    brw_math(p, sechalf(dst),
2481		     op,
2482		     inst->saturate ? BRW_MATH_SATURATE_SATURATE :
2483		     BRW_MATH_SATURATE_NONE,
2484		     0, sechalf(src[0]),
2485		     BRW_MATH_DATA_VECTOR,
2486		     BRW_MATH_PRECISION_FULL);
2487	    brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
2488	 }
2489      }
2490   } else /* gen <= 5 */{
2491      assert(inst->mlen >= 1);
2492
2493      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2494      brw_math(p, dst,
2495	       op,
2496	       inst->saturate ? BRW_MATH_SATURATE_SATURATE :
2497	       BRW_MATH_SATURATE_NONE,
2498	       inst->base_mrf, src[0],
2499	       BRW_MATH_DATA_VECTOR,
2500	       BRW_MATH_PRECISION_FULL);
2501
2502      if (c->dispatch_width == 16) {
2503	 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
2504	 brw_math(p, sechalf(dst),
2505		  op,
2506		  inst->saturate ? BRW_MATH_SATURATE_SATURATE :
2507		  BRW_MATH_SATURATE_NONE,
2508		  inst->base_mrf + 1, sechalf(src[0]),
2509		  BRW_MATH_DATA_VECTOR,
2510		  BRW_MATH_PRECISION_FULL);
2511
2512	 brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
2513      }
2514   }
2515}
2516
2517void
2518fs_visitor::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
2519{
2520   int msg_type = -1;
2521   int rlen = 4;
2522   uint32_t simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
2523
2524   if (c->dispatch_width == 16) {
2525      rlen = 8;
2526      dst = vec16(dst);
2527      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
2528   }
2529
2530   if (intel->gen >= 5) {
2531      switch (inst->opcode) {
2532      case FS_OPCODE_TEX:
2533	 if (inst->shadow_compare) {
2534	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE;
2535	 } else {
2536	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE;
2537	 }
2538	 break;
2539      case FS_OPCODE_TXB:
2540	 if (inst->shadow_compare) {
2541	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE;
2542	 } else {
2543	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS;
2544	 }
2545	 break;
2546      case FS_OPCODE_TXL:
2547	 if (inst->shadow_compare) {
2548	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE;
2549	 } else {
2550	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD;
2551	 }
2552	 break;
2553      case FS_OPCODE_TXD:
2554	 assert(!"TXD isn't supported on gen5+ yet.");
2555	 break;
2556      }
2557   } else {
2558      switch (inst->opcode) {
2559      case FS_OPCODE_TEX:
2560	 /* Note that G45 and older determines shadow compare and dispatch width
2561	  * from message length for most messages.
2562	  */
2563	 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
2564	 if (inst->shadow_compare) {
2565	    assert(inst->mlen == 6);
2566	 } else {
2567	    assert(inst->mlen <= 4);
2568	 }
2569	 break;
2570      case FS_OPCODE_TXB:
2571	 if (inst->shadow_compare) {
2572	    assert(inst->mlen == 6);
2573	    msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_COMPARE;
2574	 } else {
2575	    assert(inst->mlen == 9);
2576	    msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
2577	    simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
2578	 }
2579	 break;
2580      case FS_OPCODE_TXL:
2581	 if (inst->shadow_compare) {
2582	    assert(inst->mlen == 6);
2583	    msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE;
2584	 } else {
2585	    assert(inst->mlen == 9);
2586	    msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD;
2587	    simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
2588	 }
2589	 break;
2590      case FS_OPCODE_TXD:
2591	 assert(!"TXD isn't supported on gen4 yet.");
2592	 break;
2593      }
2594   }
2595   assert(msg_type != -1);
2596
2597   brw_SAMPLE(p,
2598	      retype(dst, BRW_REGISTER_TYPE_UW),
2599	      inst->base_mrf,
2600	      src,
2601              SURF_INDEX_TEXTURE(inst->sampler),
2602	      inst->sampler,
2603	      WRITEMASK_XYZW,
2604	      msg_type,
2605	      rlen,
2606	      inst->mlen,
2607	      0,
2608	      1,
2609	      simd_mode);
2610}
2611
2612
2613/* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
2614 * looking like:
2615 *
2616 * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
2617 *
2618 * and we're trying to produce:
2619 *
2620 *           DDX                     DDY
2621 * dst: (ss0.tr - ss0.tl)     (ss0.tl - ss0.bl)
2622 *      (ss0.tr - ss0.tl)     (ss0.tr - ss0.br)
2623 *      (ss0.br - ss0.bl)     (ss0.tl - ss0.bl)
2624 *      (ss0.br - ss0.bl)     (ss0.tr - ss0.br)
2625 *      (ss1.tr - ss1.tl)     (ss1.tl - ss1.bl)
2626 *      (ss1.tr - ss1.tl)     (ss1.tr - ss1.br)
2627 *      (ss1.br - ss1.bl)     (ss1.tl - ss1.bl)
2628 *      (ss1.br - ss1.bl)     (ss1.tr - ss1.br)
2629 *
2630 * and add another set of two more subspans if in 16-pixel dispatch mode.
2631 *
2632 * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
2633 * for each pair, and vertstride = 2 jumps us 2 elements after processing a
2634 * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
2635 * between each other.  We could probably do it like ddx and swizzle the right
2636 * order later, but bail for now and just produce
2637 * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
2638 */
2639void
2640fs_visitor::generate_ddx(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
2641{
2642   struct brw_reg src0 = brw_reg(src.file, src.nr, 1,
2643				 BRW_REGISTER_TYPE_F,
2644				 BRW_VERTICAL_STRIDE_2,
2645				 BRW_WIDTH_2,
2646				 BRW_HORIZONTAL_STRIDE_0,
2647				 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
2648   struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
2649				 BRW_REGISTER_TYPE_F,
2650				 BRW_VERTICAL_STRIDE_2,
2651				 BRW_WIDTH_2,
2652				 BRW_HORIZONTAL_STRIDE_0,
2653				 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
2654   brw_ADD(p, dst, src0, negate(src1));
2655}
2656
2657void
2658fs_visitor::generate_ddy(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
2659{
2660   struct brw_reg src0 = brw_reg(src.file, src.nr, 0,
2661				 BRW_REGISTER_TYPE_F,
2662				 BRW_VERTICAL_STRIDE_4,
2663				 BRW_WIDTH_4,
2664				 BRW_HORIZONTAL_STRIDE_0,
2665				 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
2666   struct brw_reg src1 = brw_reg(src.file, src.nr, 2,
2667				 BRW_REGISTER_TYPE_F,
2668				 BRW_VERTICAL_STRIDE_4,
2669				 BRW_WIDTH_4,
2670				 BRW_HORIZONTAL_STRIDE_0,
2671				 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
2672   brw_ADD(p, dst, src0, negate(src1));
2673}
2674
2675void
2676fs_visitor::generate_discard_not(fs_inst *inst, struct brw_reg mask)
2677{
2678   if (intel->gen >= 6) {
2679      /* Gen6 no longer has the mask reg for us to just read the
2680       * active channels from.  However, cmp updates just the channels
2681       * of the flag reg that are enabled, so we can get at the
2682       * channel enables that way.  In this step, make a reg of ones
2683       * we'll compare to.
2684       */
2685      brw_MOV(p, mask, brw_imm_ud(1));
2686   } else {
2687      brw_push_insn_state(p);
2688      brw_set_mask_control(p, BRW_MASK_DISABLE);
2689      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2690      brw_NOT(p, mask, brw_mask_reg(1)); /* IMASK */
2691      brw_pop_insn_state(p);
2692   }
2693}
2694
2695void
2696fs_visitor::generate_discard_and(fs_inst *inst, struct brw_reg mask)
2697{
2698   if (intel->gen >= 6) {
2699      struct brw_reg f0 = brw_flag_reg();
2700      struct brw_reg g1 = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
2701
2702      brw_push_insn_state(p);
2703      brw_set_mask_control(p, BRW_MASK_DISABLE);
2704      brw_MOV(p, f0, brw_imm_uw(0xffff)); /* inactive channels undiscarded */
2705      brw_pop_insn_state(p);
2706
2707      brw_CMP(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UD),
2708	      BRW_CONDITIONAL_Z, mask, brw_imm_ud(0)); /* active channels fail test */
2709      /* Undo CMP's whacking of predication*/
2710      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2711
2712      brw_push_insn_state(p);
2713      brw_set_mask_control(p, BRW_MASK_DISABLE);
2714      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2715      brw_AND(p, g1, f0, g1);
2716      brw_pop_insn_state(p);
2717   } else {
2718      struct brw_reg g0 = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
2719
2720      mask = brw_uw1_reg(mask.file, mask.nr, 0);
2721
2722      brw_push_insn_state(p);
2723      brw_set_mask_control(p, BRW_MASK_DISABLE);
2724      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2725      brw_AND(p, g0, mask, g0);
2726      brw_pop_insn_state(p);
2727   }
2728}
2729
2730void
2731fs_visitor::generate_spill(fs_inst *inst, struct brw_reg src)
2732{
2733   assert(inst->mlen != 0);
2734
2735   brw_MOV(p,
2736	   retype(brw_message_reg(inst->base_mrf + 1), BRW_REGISTER_TYPE_UD),
2737	   retype(src, BRW_REGISTER_TYPE_UD));
2738   brw_oword_block_write_scratch(p, brw_message_reg(inst->base_mrf), 1,
2739				 inst->offset);
2740}
2741
2742void
2743fs_visitor::generate_unspill(fs_inst *inst, struct brw_reg dst)
2744{
2745   assert(inst->mlen != 0);
2746
2747   /* Clear any post destination dependencies that would be ignored by
2748    * the block read.  See the B-Spec for pre-gen5 send instruction.
2749    *
2750    * This could use a better solution, since texture sampling and
2751    * math reads could potentially run into it as well -- anywhere
2752    * that we have a SEND with a destination that is a register that
2753    * was written but not read within the last N instructions (what's
2754    * N?  unsure).  This is rare because of dead code elimination, but
2755    * not impossible.
2756    */
2757   if (intel->gen == 4 && !intel->is_g4x)
2758      brw_MOV(p, brw_null_reg(), dst);
2759
2760   brw_oword_block_read_scratch(p, dst, brw_message_reg(inst->base_mrf), 1,
2761				inst->offset);
2762
2763   if (intel->gen == 4 && !intel->is_g4x) {
2764      /* gen4 errata: destination from a send can't be used as a
2765       * destination until it's been read.  Just read it so we don't
2766       * have to worry.
2767       */
2768      brw_MOV(p, brw_null_reg(), dst);
2769   }
2770}
2771
2772
2773void
2774fs_visitor::generate_pull_constant_load(fs_inst *inst, struct brw_reg dst)
2775{
2776   assert(inst->mlen != 0);
2777
2778   /* Clear any post destination dependencies that would be ignored by
2779    * the block read.  See the B-Spec for pre-gen5 send instruction.
2780    *
2781    * This could use a better solution, since texture sampling and
2782    * math reads could potentially run into it as well -- anywhere
2783    * that we have a SEND with a destination that is a register that
2784    * was written but not read within the last N instructions (what's
2785    * N?  unsure).  This is rare because of dead code elimination, but
2786    * not impossible.
2787    */
2788   if (intel->gen == 4 && !intel->is_g4x)
2789      brw_MOV(p, brw_null_reg(), dst);
2790
2791   brw_oword_block_read(p, dst, brw_message_reg(inst->base_mrf),
2792			inst->offset, SURF_INDEX_FRAG_CONST_BUFFER);
2793
2794   if (intel->gen == 4 && !intel->is_g4x) {
2795      /* gen4 errata: destination from a send can't be used as a
2796       * destination until it's been read.  Just read it so we don't
2797       * have to worry.
2798       */
2799      brw_MOV(p, brw_null_reg(), dst);
2800   }
2801}
2802
2803/**
2804 * To be called after the last _mesa_add_state_reference() call, to
2805 * set up prog_data.param[] for assign_curb_setup() and
2806 * setup_pull_constants().
2807 */
2808void
2809fs_visitor::setup_paramvalues_refs()
2810{
2811   if (c->dispatch_width != 8)
2812      return;
2813
2814   /* Set up the pointers to ParamValues now that that array is finalized. */
2815   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
2816      c->prog_data.param[i] =
2817	 fp->Base.Parameters->ParameterValues[this->param_index[i]] +
2818	 this->param_offset[i];
2819   }
2820}
2821
2822void
2823fs_visitor::assign_curb_setup()
2824{
2825   c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
2826   if (c->dispatch_width == 8) {
2827      c->prog_data.first_curbe_grf = c->nr_payload_regs;
2828   } else {
2829      c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
2830   }
2831
2832   /* Map the offsets in the UNIFORM file to fixed HW regs. */
2833   foreach_iter(exec_list_iterator, iter, this->instructions) {
2834      fs_inst *inst = (fs_inst *)iter.get();
2835
2836      for (unsigned int i = 0; i < 3; i++) {
2837	 if (inst->src[i].file == UNIFORM) {
2838	    int constant_nr = inst->src[i].hw_reg + inst->src[i].reg_offset;
2839	    struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
2840						  constant_nr / 8,
2841						  constant_nr % 8);
2842
2843	    inst->src[i].file = FIXED_HW_REG;
2844	    inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
2845	 }
2846      }
2847   }
2848}
2849
2850void
2851fs_visitor::calculate_urb_setup()
2852{
2853   for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
2854      urb_setup[i] = -1;
2855   }
2856
2857   int urb_next = 0;
2858   /* Figure out where each of the incoming setup attributes lands. */
2859   if (intel->gen >= 6) {
2860      for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
2861	 if (brw->fragment_program->Base.InputsRead & BITFIELD64_BIT(i)) {
2862	    urb_setup[i] = urb_next++;
2863	 }
2864      }
2865   } else {
2866      /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
2867      for (unsigned int i = 0; i < VERT_RESULT_MAX; i++) {
2868	 if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
2869	    int fp_index;
2870
2871	    if (i >= VERT_RESULT_VAR0)
2872	       fp_index = i - (VERT_RESULT_VAR0 - FRAG_ATTRIB_VAR0);
2873	    else if (i <= VERT_RESULT_TEX7)
2874	       fp_index = i;
2875	    else
2876	       fp_index = -1;
2877
2878	    if (fp_index >= 0)
2879	       urb_setup[fp_index] = urb_next++;
2880	 }
2881      }
2882   }
2883
2884   /* Each attribute is 4 setup channels, each of which is half a reg. */
2885   c->prog_data.urb_read_length = urb_next * 2;
2886}
2887
2888void
2889fs_visitor::assign_urb_setup()
2890{
2891   int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
2892
2893   /* Offset all the urb_setup[] index by the actual position of the
2894    * setup regs, now that the location of the constants has been chosen.
2895    */
2896   foreach_iter(exec_list_iterator, iter, this->instructions) {
2897      fs_inst *inst = (fs_inst *)iter.get();
2898
2899      if (inst->opcode == FS_OPCODE_LINTERP) {
2900	 assert(inst->src[2].file == FIXED_HW_REG);
2901	 inst->src[2].fixed_hw_reg.nr += urb_start;
2902      }
2903
2904      if (inst->opcode == FS_OPCODE_CINTERP) {
2905	 assert(inst->src[0].file == FIXED_HW_REG);
2906	 inst->src[0].fixed_hw_reg.nr += urb_start;
2907      }
2908   }
2909
2910   this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
2911}
2912
2913/**
2914 * Split large virtual GRFs into separate components if we can.
2915 *
2916 * This is mostly duplicated with what brw_fs_vector_splitting does,
2917 * but that's really conservative because it's afraid of doing
2918 * splitting that doesn't result in real progress after the rest of
2919 * the optimization phases, which would cause infinite looping in
2920 * optimization.  We can do it once here, safely.  This also has the
2921 * opportunity to split interpolated values, or maybe even uniforms,
2922 * which we don't have at the IR level.
2923 *
2924 * We want to split, because virtual GRFs are what we register
2925 * allocate and spill (due to contiguousness requirements for some
2926 * instructions), and they're what we naturally generate in the
2927 * codegen process, but most virtual GRFs don't actually need to be
2928 * contiguous sets of GRFs.  If we split, we'll end up with reduced
2929 * live intervals and better dead code elimination and coalescing.
2930 */
2931void
2932fs_visitor::split_virtual_grfs()
2933{
2934   int num_vars = this->virtual_grf_next;
2935   bool split_grf[num_vars];
2936   int new_virtual_grf[num_vars];
2937
2938   /* Try to split anything > 0 sized. */
2939   for (int i = 0; i < num_vars; i++) {
2940      if (this->virtual_grf_sizes[i] != 1)
2941	 split_grf[i] = true;
2942      else
2943	 split_grf[i] = false;
2944   }
2945
2946   if (brw->has_pln) {
2947      /* PLN opcodes rely on the delta_xy being contiguous. */
2948      split_grf[this->delta_x.reg] = false;
2949   }
2950
2951   foreach_iter(exec_list_iterator, iter, this->instructions) {
2952      fs_inst *inst = (fs_inst *)iter.get();
2953
2954      /* Texturing produces 4 contiguous registers, so no splitting. */
2955      if (inst->is_tex()) {
2956	 split_grf[inst->dst.reg] = false;
2957      }
2958   }
2959
2960   /* Allocate new space for split regs.  Note that the virtual
2961    * numbers will be contiguous.
2962    */
2963   for (int i = 0; i < num_vars; i++) {
2964      if (split_grf[i]) {
2965	 new_virtual_grf[i] = virtual_grf_alloc(1);
2966	 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
2967	    int reg = virtual_grf_alloc(1);
2968	    assert(reg == new_virtual_grf[i] + j - 1);
2969	    (void) reg;
2970	 }
2971	 this->virtual_grf_sizes[i] = 1;
2972      }
2973   }
2974
2975   foreach_iter(exec_list_iterator, iter, this->instructions) {
2976      fs_inst *inst = (fs_inst *)iter.get();
2977
2978      if (inst->dst.file == GRF &&
2979	  split_grf[inst->dst.reg] &&
2980	  inst->dst.reg_offset != 0) {
2981	 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
2982			  inst->dst.reg_offset - 1);
2983	 inst->dst.reg_offset = 0;
2984      }
2985      for (int i = 0; i < 3; i++) {
2986	 if (inst->src[i].file == GRF &&
2987	     split_grf[inst->src[i].reg] &&
2988	     inst->src[i].reg_offset != 0) {
2989	    inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
2990				inst->src[i].reg_offset - 1);
2991	    inst->src[i].reg_offset = 0;
2992	 }
2993      }
2994   }
2995   this->live_intervals_valid = false;
2996}
2997
2998/**
2999 * Choose accesses from the UNIFORM file to demote to using the pull
3000 * constant buffer.
3001 *
3002 * We allow a fragment shader to have more than the specified minimum
3003 * maximum number of fragment shader uniform components (64).  If
3004 * there are too many of these, they'd fill up all of register space.
3005 * So, this will push some of them out to the pull constant buffer and
3006 * update the program to load them.
3007 */
3008void
3009fs_visitor::setup_pull_constants()
3010{
3011   /* Only allow 16 registers (128 uniform components) as push constants. */
3012   unsigned int max_uniform_components = 16 * 8;
3013   if (c->prog_data.nr_params <= max_uniform_components)
3014      return;
3015
3016   if (c->dispatch_width == 16) {
3017      fail("Pull constants not supported in 16-wide\n");
3018      return;
3019   }
3020
3021   /* Just demote the end of the list.  We could probably do better
3022    * here, demoting things that are rarely used in the program first.
3023    */
3024   int pull_uniform_base = max_uniform_components;
3025   int pull_uniform_count = c->prog_data.nr_params - pull_uniform_base;
3026
3027   foreach_iter(exec_list_iterator, iter, this->instructions) {
3028      fs_inst *inst = (fs_inst *)iter.get();
3029
3030      for (int i = 0; i < 3; i++) {
3031	 if (inst->src[i].file != UNIFORM)
3032	    continue;
3033
3034	 int uniform_nr = inst->src[i].hw_reg + inst->src[i].reg_offset;
3035	 if (uniform_nr < pull_uniform_base)
3036	    continue;
3037
3038	 fs_reg dst = fs_reg(this, glsl_type::float_type);
3039	 fs_inst *pull = new(mem_ctx) fs_inst(FS_OPCODE_PULL_CONSTANT_LOAD,
3040					      dst);
3041	 pull->offset = ((uniform_nr - pull_uniform_base) * 4) & ~15;
3042	 pull->ir = inst->ir;
3043	 pull->annotation = inst->annotation;
3044	 pull->base_mrf = 14;
3045	 pull->mlen = 1;
3046
3047	 inst->insert_before(pull);
3048
3049	 inst->src[i].file = GRF;
3050	 inst->src[i].reg = dst.reg;
3051	 inst->src[i].reg_offset = 0;
3052	 inst->src[i].smear = (uniform_nr - pull_uniform_base) & 3;
3053      }
3054   }
3055
3056   for (int i = 0; i < pull_uniform_count; i++) {
3057      c->prog_data.pull_param[i] = c->prog_data.param[pull_uniform_base + i];
3058      c->prog_data.pull_param_convert[i] =
3059	 c->prog_data.param_convert[pull_uniform_base + i];
3060   }
3061   c->prog_data.nr_params -= pull_uniform_count;
3062   c->prog_data.nr_pull_params = pull_uniform_count;
3063}
3064
3065void
3066fs_visitor::calculate_live_intervals()
3067{
3068   int num_vars = this->virtual_grf_next;
3069   int *def = ralloc_array(mem_ctx, int, num_vars);
3070   int *use = ralloc_array(mem_ctx, int, num_vars);
3071   int loop_depth = 0;
3072   int loop_start = 0;
3073   int bb_header_ip = 0;
3074
3075   if (this->live_intervals_valid)
3076      return;
3077
3078   for (int i = 0; i < num_vars; i++) {
3079      def[i] = MAX_INSTRUCTION;
3080      use[i] = -1;
3081   }
3082
3083   int ip = 0;
3084   foreach_iter(exec_list_iterator, iter, this->instructions) {
3085      fs_inst *inst = (fs_inst *)iter.get();
3086
3087      if (inst->opcode == BRW_OPCODE_DO) {
3088	 if (loop_depth++ == 0)
3089	    loop_start = ip;
3090      } else if (inst->opcode == BRW_OPCODE_WHILE) {
3091	 loop_depth--;
3092
3093	 if (loop_depth == 0) {
3094	    /* Patches up the use of vars marked for being live across
3095	     * the whole loop.
3096	     */
3097	    for (int i = 0; i < num_vars; i++) {
3098	       if (use[i] == loop_start) {
3099		  use[i] = ip;
3100	       }
3101	    }
3102	 }
3103      } else {
3104	 for (unsigned int i = 0; i < 3; i++) {
3105	    if (inst->src[i].file == GRF && inst->src[i].reg != 0) {
3106	       int reg = inst->src[i].reg;
3107
3108	       if (!loop_depth) {
3109		  use[reg] = ip;
3110	       } else {
3111		  def[reg] = MIN2(loop_start, def[reg]);
3112		  use[reg] = loop_start;
3113
3114		  /* Nobody else is going to go smash our start to
3115		   * later in the loop now, because def[reg] now
3116		   * points before the bb header.
3117		   */
3118	       }
3119	    }
3120	 }
3121	 if (inst->dst.file == GRF && inst->dst.reg != 0) {
3122	    int reg = inst->dst.reg;
3123
3124	    if (!loop_depth) {
3125	       def[reg] = MIN2(def[reg], ip);
3126	    } else {
3127	       def[reg] = MIN2(def[reg], loop_start);
3128	    }
3129	 }
3130      }
3131
3132      ip++;
3133
3134      /* Set the basic block header IP.  This is used for determining
3135       * if a complete def of single-register virtual GRF in a loop
3136       * dominates a use in the same basic block.  It's a quick way to
3137       * reduce the live interval range of most register used in a
3138       * loop.
3139       */
3140      if (inst->opcode == BRW_OPCODE_IF ||
3141	  inst->opcode == BRW_OPCODE_ELSE ||
3142	  inst->opcode == BRW_OPCODE_ENDIF ||
3143	  inst->opcode == BRW_OPCODE_DO ||
3144	  inst->opcode == BRW_OPCODE_WHILE ||
3145	  inst->opcode == BRW_OPCODE_BREAK ||
3146	  inst->opcode == BRW_OPCODE_CONTINUE) {
3147	 bb_header_ip = ip;
3148      }
3149   }
3150
3151   ralloc_free(this->virtual_grf_def);
3152   ralloc_free(this->virtual_grf_use);
3153   this->virtual_grf_def = def;
3154   this->virtual_grf_use = use;
3155
3156   this->live_intervals_valid = true;
3157}
3158
3159/**
3160 * Attempts to move immediate constants into the immediate
3161 * constant slot of following instructions.
3162 *
3163 * Immediate constants are a bit tricky -- they have to be in the last
3164 * operand slot, you can't do abs/negate on them,
3165 */
3166
3167bool
3168fs_visitor::propagate_constants()
3169{
3170   bool progress = false;
3171
3172   calculate_live_intervals();
3173
3174   foreach_iter(exec_list_iterator, iter, this->instructions) {
3175      fs_inst *inst = (fs_inst *)iter.get();
3176
3177      if (inst->opcode != BRW_OPCODE_MOV ||
3178	  inst->predicated ||
3179	  inst->dst.file != GRF || inst->src[0].file != IMM ||
3180	  inst->dst.type != inst->src[0].type ||
3181	  (c->dispatch_width == 16 &&
3182	   (inst->force_uncompressed || inst->force_sechalf)))
3183	 continue;
3184
3185      /* Don't bother with cases where we should have had the
3186       * operation on the constant folded in GLSL already.
3187       */
3188      if (inst->saturate)
3189	 continue;
3190
3191      /* Found a move of a constant to a GRF.  Find anything else using the GRF
3192       * before it's written, and replace it with the constant if we can.
3193       */
3194      exec_list_iterator scan_iter = iter;
3195      scan_iter.next();
3196      for (; scan_iter.has_next(); scan_iter.next()) {
3197	 fs_inst *scan_inst = (fs_inst *)scan_iter.get();
3198
3199	 if (scan_inst->opcode == BRW_OPCODE_DO ||
3200	     scan_inst->opcode == BRW_OPCODE_WHILE ||
3201	     scan_inst->opcode == BRW_OPCODE_ELSE ||
3202	     scan_inst->opcode == BRW_OPCODE_ENDIF) {
3203	    break;
3204	 }
3205
3206	 for (int i = 2; i >= 0; i--) {
3207	    if (scan_inst->src[i].file != GRF ||
3208		scan_inst->src[i].reg != inst->dst.reg ||
3209		scan_inst->src[i].reg_offset != inst->dst.reg_offset)
3210	       continue;
3211
3212	    /* Don't bother with cases where we should have had the
3213	     * operation on the constant folded in GLSL already.
3214	     */
3215	    if (scan_inst->src[i].negate || scan_inst->src[i].abs)
3216	       continue;
3217
3218	    switch (scan_inst->opcode) {
3219	    case BRW_OPCODE_MOV:
3220	       scan_inst->src[i] = inst->src[0];
3221	       progress = true;
3222	       break;
3223
3224	    case BRW_OPCODE_MUL:
3225	    case BRW_OPCODE_ADD:
3226	       if (i == 1) {
3227		  scan_inst->src[i] = inst->src[0];
3228		  progress = true;
3229	       } else if (i == 0 && scan_inst->src[1].file != IMM) {
3230		  /* Fit this constant in by commuting the operands */
3231		  scan_inst->src[0] = scan_inst->src[1];
3232		  scan_inst->src[1] = inst->src[0];
3233		  progress = true;
3234	       }
3235	       break;
3236
3237	    case BRW_OPCODE_CMP:
3238	       if (i == 1) {
3239		  scan_inst->src[i] = inst->src[0];
3240		  progress = true;
3241	       } else if (i == 0 && scan_inst->src[1].file != IMM) {
3242		  uint32_t new_cmod;
3243
3244		  new_cmod = brw_swap_cmod(scan_inst->conditional_mod);
3245		  if (new_cmod != ~0u) {
3246		     /* Fit this constant in by swapping the operands and
3247		      * flipping the test
3248		      */
3249		     scan_inst->src[0] = scan_inst->src[1];
3250		     scan_inst->src[1] = inst->src[0];
3251		     scan_inst->conditional_mod = new_cmod;
3252		     progress = true;
3253		  }
3254	       }
3255	       break;
3256
3257	    case BRW_OPCODE_SEL:
3258	       if (i == 1) {
3259		  scan_inst->src[i] = inst->src[0];
3260		  progress = true;
3261	       } else if (i == 0 && scan_inst->src[1].file != IMM) {
3262		  /* Fit this constant in by swapping the operands and
3263		   * flipping the predicate
3264		   */
3265		  scan_inst->src[0] = scan_inst->src[1];
3266		  scan_inst->src[1] = inst->src[0];
3267		  scan_inst->predicate_inverse = !scan_inst->predicate_inverse;
3268		  progress = true;
3269	       }
3270	       break;
3271	    }
3272	 }
3273
3274	 if (scan_inst->dst.file == GRF &&
3275	     scan_inst->dst.reg == inst->dst.reg &&
3276	     (scan_inst->dst.reg_offset == inst->dst.reg_offset ||
3277	      scan_inst->is_tex())) {
3278	    break;
3279	 }
3280      }
3281   }
3282
3283   if (progress)
3284       this->live_intervals_valid = false;
3285
3286   return progress;
3287}
3288/**
3289 * Must be called after calculate_live_intervales() to remove unused
3290 * writes to registers -- register allocation will fail otherwise
3291 * because something deffed but not used won't be considered to
3292 * interfere with other regs.
3293 */
3294bool
3295fs_visitor::dead_code_eliminate()
3296{
3297   bool progress = false;
3298   int pc = 0;
3299
3300   calculate_live_intervals();
3301
3302   foreach_iter(exec_list_iterator, iter, this->instructions) {
3303      fs_inst *inst = (fs_inst *)iter.get();
3304
3305      if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
3306	 inst->remove();
3307	 progress = true;
3308      }
3309
3310      pc++;
3311   }
3312
3313   if (progress)
3314      live_intervals_valid = false;
3315
3316   return progress;
3317}
3318
3319bool
3320fs_visitor::register_coalesce()
3321{
3322   bool progress = false;
3323   int if_depth = 0;
3324   int loop_depth = 0;
3325
3326   foreach_iter(exec_list_iterator, iter, this->instructions) {
3327      fs_inst *inst = (fs_inst *)iter.get();
3328
3329      /* Make sure that we dominate the instructions we're going to
3330       * scan for interfering with our coalescing, or we won't have
3331       * scanned enough to see if anything interferes with our
3332       * coalescing.  We don't dominate the following instructions if
3333       * we're in a loop or an if block.
3334       */
3335      switch (inst->opcode) {
3336      case BRW_OPCODE_DO:
3337	 loop_depth++;
3338	 break;
3339      case BRW_OPCODE_WHILE:
3340	 loop_depth--;
3341	 break;
3342      case BRW_OPCODE_IF:
3343	 if_depth++;
3344	 break;
3345      case BRW_OPCODE_ENDIF:
3346	 if_depth--;
3347	 break;
3348      }
3349      if (loop_depth || if_depth)
3350	 continue;
3351
3352      if (inst->opcode != BRW_OPCODE_MOV ||
3353	  inst->predicated ||
3354	  inst->saturate ||
3355	  inst->dst.file != GRF || inst->src[0].file != GRF ||
3356	  inst->dst.type != inst->src[0].type)
3357	 continue;
3358
3359      bool has_source_modifiers = inst->src[0].abs || inst->src[0].negate;
3360
3361      /* Found a move of a GRF to a GRF.  Let's see if we can coalesce
3362       * them: check for no writes to either one until the exit of the
3363       * program.
3364       */
3365      bool interfered = false;
3366      exec_list_iterator scan_iter = iter;
3367      scan_iter.next();
3368      for (; scan_iter.has_next(); scan_iter.next()) {
3369	 fs_inst *scan_inst = (fs_inst *)scan_iter.get();
3370
3371	 if (scan_inst->dst.file == GRF) {
3372	    if (scan_inst->dst.reg == inst->dst.reg &&
3373		(scan_inst->dst.reg_offset == inst->dst.reg_offset ||
3374		 scan_inst->is_tex())) {
3375	       interfered = true;
3376	       break;
3377	    }
3378	    if (scan_inst->dst.reg == inst->src[0].reg &&
3379		(scan_inst->dst.reg_offset == inst->src[0].reg_offset ||
3380		 scan_inst->is_tex())) {
3381	       interfered = true;
3382	       break;
3383	    }
3384	 }
3385
3386	 /* The gen6 MATH instruction can't handle source modifiers, so avoid
3387	  * coalescing those for now.  We should do something more specific.
3388	  */
3389	 if (intel->gen == 6 && scan_inst->is_math() && has_source_modifiers) {
3390	    interfered = true;
3391	    break;
3392	 }
3393      }
3394      if (interfered) {
3395	 continue;
3396      }
3397
3398      /* Rewrite the later usage to point at the source of the move to
3399       * be removed.
3400       */
3401      for (exec_list_iterator scan_iter = iter; scan_iter.has_next();
3402	   scan_iter.next()) {
3403	 fs_inst *scan_inst = (fs_inst *)scan_iter.get();
3404
3405	 for (int i = 0; i < 3; i++) {
3406	    if (scan_inst->src[i].file == GRF &&
3407		scan_inst->src[i].reg == inst->dst.reg &&
3408		scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
3409	       scan_inst->src[i].reg = inst->src[0].reg;
3410	       scan_inst->src[i].reg_offset = inst->src[0].reg_offset;
3411	       scan_inst->src[i].abs |= inst->src[0].abs;
3412	       scan_inst->src[i].negate ^= inst->src[0].negate;
3413	       scan_inst->src[i].smear = inst->src[0].smear;
3414	    }
3415	 }
3416      }
3417
3418      inst->remove();
3419      progress = true;
3420   }
3421
3422   if (progress)
3423      live_intervals_valid = false;
3424
3425   return progress;
3426}
3427
3428
3429bool
3430fs_visitor::compute_to_mrf()
3431{
3432   bool progress = false;
3433   int next_ip = 0;
3434
3435   calculate_live_intervals();
3436
3437   foreach_iter(exec_list_iterator, iter, this->instructions) {
3438      fs_inst *inst = (fs_inst *)iter.get();
3439
3440      int ip = next_ip;
3441      next_ip++;
3442
3443      if (inst->opcode != BRW_OPCODE_MOV ||
3444	  inst->predicated ||
3445	  inst->dst.file != MRF || inst->src[0].file != GRF ||
3446	  inst->dst.type != inst->src[0].type ||
3447	  inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
3448	 continue;
3449
3450      /* Work out which hardware MRF registers are written by this
3451       * instruction.
3452       */
3453      int mrf_low = inst->dst.hw_reg & ~BRW_MRF_COMPR4;
3454      int mrf_high;
3455      if (inst->dst.hw_reg & BRW_MRF_COMPR4) {
3456	 mrf_high = mrf_low + 4;
3457      } else if (c->dispatch_width == 16 &&
3458		 (!inst->force_uncompressed && !inst->force_sechalf)) {
3459	 mrf_high = mrf_low + 1;
3460      } else {
3461	 mrf_high = mrf_low;
3462      }
3463
3464      /* Can't compute-to-MRF this GRF if someone else was going to
3465       * read it later.
3466       */
3467      if (this->virtual_grf_use[inst->src[0].reg] > ip)
3468	 continue;
3469
3470      /* Found a move of a GRF to a MRF.  Let's see if we can go
3471       * rewrite the thing that made this GRF to write into the MRF.
3472       */
3473      fs_inst *scan_inst;
3474      for (scan_inst = (fs_inst *)inst->prev;
3475	   scan_inst->prev != NULL;
3476	   scan_inst = (fs_inst *)scan_inst->prev) {
3477	 if (scan_inst->dst.file == GRF &&
3478	     scan_inst->dst.reg == inst->src[0].reg) {
3479	    /* Found the last thing to write our reg we want to turn
3480	     * into a compute-to-MRF.
3481	     */
3482
3483	    if (scan_inst->is_tex()) {
3484	       /* texturing writes several continuous regs, so we can't
3485		* compute-to-mrf that.
3486		*/
3487	       break;
3488	    }
3489
3490	    /* If it's predicated, it (probably) didn't populate all
3491	     * the channels.  We might be able to rewrite everything
3492	     * that writes that reg, but it would require smarter
3493	     * tracking to delay the rewriting until complete success.
3494	     */
3495	    if (scan_inst->predicated)
3496	       break;
3497
3498	    /* If it's half of register setup and not the same half as
3499	     * our MOV we're trying to remove, bail for now.
3500	     */
3501	    if (scan_inst->force_uncompressed != inst->force_uncompressed ||
3502		scan_inst->force_sechalf != inst->force_sechalf) {
3503	       break;
3504	    }
3505
3506	    /* SEND instructions can't have MRF as a destination. */
3507	    if (scan_inst->mlen)
3508	       break;
3509
3510	    if (intel->gen >= 6) {
3511	       /* gen6 math instructions must have the destination be
3512		* GRF, so no compute-to-MRF for them.
3513		*/
3514	       if (scan_inst->is_math()) {
3515		  break;
3516	       }
3517	    }
3518
3519	    if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
3520	       /* Found the creator of our MRF's source value. */
3521	       scan_inst->dst.file = MRF;
3522	       scan_inst->dst.hw_reg = inst->dst.hw_reg;
3523	       scan_inst->saturate |= inst->saturate;
3524	       inst->remove();
3525	       progress = true;
3526	    }
3527	    break;
3528	 }
3529
3530	 /* We don't handle flow control here.  Most computation of
3531	  * values that end up in MRFs are shortly before the MRF
3532	  * write anyway.
3533	  */
3534	 if (scan_inst->opcode == BRW_OPCODE_DO ||
3535	     scan_inst->opcode == BRW_OPCODE_WHILE ||
3536	     scan_inst->opcode == BRW_OPCODE_ELSE ||
3537	     scan_inst->opcode == BRW_OPCODE_ENDIF) {
3538	    break;
3539	 }
3540
3541	 /* You can't read from an MRF, so if someone else reads our
3542	  * MRF's source GRF that we wanted to rewrite, that stops us.
3543	  */
3544	 bool interfered = false;
3545	 for (int i = 0; i < 3; i++) {
3546	    if (scan_inst->src[i].file == GRF &&
3547		scan_inst->src[i].reg == inst->src[0].reg &&
3548		scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
3549	       interfered = true;
3550	    }
3551	 }
3552	 if (interfered)
3553	    break;
3554
3555	 if (scan_inst->dst.file == MRF) {
3556	    /* If somebody else writes our MRF here, we can't
3557	     * compute-to-MRF before that.
3558	     */
3559	    int scan_mrf_low = scan_inst->dst.hw_reg & ~BRW_MRF_COMPR4;
3560	    int scan_mrf_high;
3561
3562	    if (scan_inst->dst.hw_reg & BRW_MRF_COMPR4) {
3563	       scan_mrf_high = scan_mrf_low + 4;
3564	    } else if (c->dispatch_width == 16 &&
3565		       (!scan_inst->force_uncompressed &&
3566			!scan_inst->force_sechalf)) {
3567	       scan_mrf_high = scan_mrf_low + 1;
3568	    } else {
3569	       scan_mrf_high = scan_mrf_low;
3570	    }
3571
3572	    if (mrf_low == scan_mrf_low ||
3573		mrf_low == scan_mrf_high ||
3574		mrf_high == scan_mrf_low ||
3575		mrf_high == scan_mrf_high) {
3576	       break;
3577	    }
3578	 }
3579
3580	 if (scan_inst->mlen > 0) {
3581	    /* Found a SEND instruction, which means that there are
3582	     * live values in MRFs from base_mrf to base_mrf +
3583	     * scan_inst->mlen - 1.  Don't go pushing our MRF write up
3584	     * above it.
3585	     */
3586	    if (mrf_low >= scan_inst->base_mrf &&
3587		mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
3588	       break;
3589	    }
3590	    if (mrf_high >= scan_inst->base_mrf &&
3591		mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
3592	       break;
3593	    }
3594	 }
3595      }
3596   }
3597
3598   return progress;
3599}
3600
3601/**
3602 * Walks through basic blocks, locking for repeated MRF writes and
3603 * removing the later ones.
3604 */
3605bool
3606fs_visitor::remove_duplicate_mrf_writes()
3607{
3608   fs_inst *last_mrf_move[16];
3609   bool progress = false;
3610
3611   /* Need to update the MRF tracking for compressed instructions. */
3612   if (c->dispatch_width == 16)
3613      return false;
3614
3615   memset(last_mrf_move, 0, sizeof(last_mrf_move));
3616
3617   foreach_iter(exec_list_iterator, iter, this->instructions) {
3618      fs_inst *inst = (fs_inst *)iter.get();
3619
3620      switch (inst->opcode) {
3621      case BRW_OPCODE_DO:
3622      case BRW_OPCODE_WHILE:
3623      case BRW_OPCODE_IF:
3624      case BRW_OPCODE_ELSE:
3625      case BRW_OPCODE_ENDIF:
3626	 memset(last_mrf_move, 0, sizeof(last_mrf_move));
3627	 continue;
3628      default:
3629	 break;
3630      }
3631
3632      if (inst->opcode == BRW_OPCODE_MOV &&
3633	  inst->dst.file == MRF) {
3634	 fs_inst *prev_inst = last_mrf_move[inst->dst.hw_reg];
3635	 if (prev_inst && inst->equals(prev_inst)) {
3636	    inst->remove();
3637	    progress = true;
3638	    continue;
3639	 }
3640      }
3641
3642      /* Clear out the last-write records for MRFs that were overwritten. */
3643      if (inst->dst.file == MRF) {
3644	 last_mrf_move[inst->dst.hw_reg] = NULL;
3645      }
3646
3647      if (inst->mlen > 0) {
3648	 /* Found a SEND instruction, which will include two or fewer
3649	  * implied MRF writes.  We could do better here.
3650	  */
3651	 for (int i = 0; i < implied_mrf_writes(inst); i++) {
3652	    last_mrf_move[inst->base_mrf + i] = NULL;
3653	 }
3654      }
3655
3656      /* Clear out any MRF move records whose sources got overwritten. */
3657      if (inst->dst.file == GRF) {
3658	 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
3659	    if (last_mrf_move[i] &&
3660		last_mrf_move[i]->src[0].reg == inst->dst.reg) {
3661	       last_mrf_move[i] = NULL;
3662	    }
3663	 }
3664      }
3665
3666      if (inst->opcode == BRW_OPCODE_MOV &&
3667	  inst->dst.file == MRF &&
3668	  inst->src[0].file == GRF &&
3669	  !inst->predicated) {
3670	 last_mrf_move[inst->dst.hw_reg] = inst;
3671      }
3672   }
3673
3674   return progress;
3675}
3676
3677bool
3678fs_visitor::virtual_grf_interferes(int a, int b)
3679{
3680   int start = MAX2(this->virtual_grf_def[a], this->virtual_grf_def[b]);
3681   int end = MIN2(this->virtual_grf_use[a], this->virtual_grf_use[b]);
3682
3683   /* We can't handle dead register writes here, without iterating
3684    * over the whole instruction stream to find every single dead
3685    * write to that register to compare to the live interval of the
3686    * other register.  Just assert that dead_code_eliminate() has been
3687    * called.
3688    */
3689   assert((this->virtual_grf_use[a] != -1 ||
3690	   this->virtual_grf_def[a] == MAX_INSTRUCTION) &&
3691	  (this->virtual_grf_use[b] != -1 ||
3692	   this->virtual_grf_def[b] == MAX_INSTRUCTION));
3693
3694   /* If the register is used to store 16 values of less than float
3695    * size (only the case for pixel_[xy]), then we can't allocate
3696    * another dword-sized thing to that register that would be used in
3697    * the same instruction.  This is because when the GPU decodes (for
3698    * example):
3699    *
3700    * (declare (in ) vec4 gl_FragCoord@0x97766a0)
3701    * add(16)         g6<1>F          g6<8,8,1>UW     0.5F { align1 compr };
3702    *
3703    * it's actually processed as:
3704    * add(8)         g6<1>F          g6<8,8,1>UW     0.5F { align1 };
3705    * add(8)         g7<1>F          g6.8<8,8,1>UW   0.5F { align1 sechalf };
3706    *
3707    * so our second half values in g6 got overwritten in the first
3708    * half.
3709    */
3710   if (c->dispatch_width == 16 && (this->pixel_x.reg == a ||
3711				   this->pixel_x.reg == b ||
3712				   this->pixel_y.reg == a ||
3713				   this->pixel_y.reg == b)) {
3714      return start <= end;
3715   }
3716
3717   return start < end;
3718}
3719
3720static struct brw_reg brw_reg_from_fs_reg(fs_reg *reg)
3721{
3722   struct brw_reg brw_reg;
3723
3724   switch (reg->file) {
3725   case GRF:
3726   case ARF:
3727   case MRF:
3728      if (reg->smear == -1) {
3729	 brw_reg = brw_vec8_reg(reg->file,
3730				reg->hw_reg, 0);
3731      } else {
3732	 brw_reg = brw_vec1_reg(reg->file,
3733				reg->hw_reg, reg->smear);
3734      }
3735      brw_reg = retype(brw_reg, reg->type);
3736      if (reg->sechalf)
3737	 brw_reg = sechalf(brw_reg);
3738      break;
3739   case IMM:
3740      switch (reg->type) {
3741      case BRW_REGISTER_TYPE_F:
3742	 brw_reg = brw_imm_f(reg->imm.f);
3743	 break;
3744      case BRW_REGISTER_TYPE_D:
3745	 brw_reg = brw_imm_d(reg->imm.i);
3746	 break;
3747      case BRW_REGISTER_TYPE_UD:
3748	 brw_reg = brw_imm_ud(reg->imm.u);
3749	 break;
3750      default:
3751	 assert(!"not reached");
3752	 brw_reg = brw_null_reg();
3753	 break;
3754      }
3755      break;
3756   case FIXED_HW_REG:
3757      brw_reg = reg->fixed_hw_reg;
3758      break;
3759   case BAD_FILE:
3760      /* Probably unused. */
3761      brw_reg = brw_null_reg();
3762      break;
3763   case UNIFORM:
3764      assert(!"not reached");
3765      brw_reg = brw_null_reg();
3766      break;
3767   default:
3768      assert(!"not reached");
3769      brw_reg = brw_null_reg();
3770      break;
3771   }
3772   if (reg->abs)
3773      brw_reg = brw_abs(brw_reg);
3774   if (reg->negate)
3775      brw_reg = negate(brw_reg);
3776
3777   return brw_reg;
3778}
3779
3780void
3781fs_visitor::generate_code()
3782{
3783   int last_native_inst = p->nr_insn;
3784   const char *last_annotation_string = NULL;
3785   ir_instruction *last_annotation_ir = NULL;
3786
3787   int loop_stack_array_size = 16;
3788   int loop_stack_depth = 0;
3789   brw_instruction **loop_stack =
3790      rzalloc_array(this->mem_ctx, brw_instruction *, loop_stack_array_size);
3791   int *if_depth_in_loop =
3792      rzalloc_array(this->mem_ctx, int, loop_stack_array_size);
3793
3794
3795   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3796      printf("Native code for fragment shader %d (%d-wide dispatch):\n",
3797	     ctx->Shader.CurrentFragmentProgram->Name, c->dispatch_width);
3798   }
3799
3800   foreach_iter(exec_list_iterator, iter, this->instructions) {
3801      fs_inst *inst = (fs_inst *)iter.get();
3802      struct brw_reg src[3], dst;
3803
3804      if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3805	 if (last_annotation_ir != inst->ir) {
3806	    last_annotation_ir = inst->ir;
3807	    if (last_annotation_ir) {
3808	       printf("   ");
3809	       last_annotation_ir->print();
3810	       printf("\n");
3811	    }
3812	 }
3813	 if (last_annotation_string != inst->annotation) {
3814	    last_annotation_string = inst->annotation;
3815	    if (last_annotation_string)
3816	       printf("   %s\n", last_annotation_string);
3817	 }
3818      }
3819
3820      for (unsigned int i = 0; i < 3; i++) {
3821	 src[i] = brw_reg_from_fs_reg(&inst->src[i]);
3822      }
3823      dst = brw_reg_from_fs_reg(&inst->dst);
3824
3825      brw_set_conditionalmod(p, inst->conditional_mod);
3826      brw_set_predicate_control(p, inst->predicated);
3827      brw_set_predicate_inverse(p, inst->predicate_inverse);
3828      brw_set_saturate(p, inst->saturate);
3829
3830      if (inst->force_uncompressed || c->dispatch_width == 8) {
3831	 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
3832      } else if (inst->force_sechalf) {
3833	 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
3834      } else {
3835	 brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
3836      }
3837
3838      switch (inst->opcode) {
3839      case BRW_OPCODE_MOV:
3840	 brw_MOV(p, dst, src[0]);
3841	 break;
3842      case BRW_OPCODE_ADD:
3843	 brw_ADD(p, dst, src[0], src[1]);
3844	 break;
3845      case BRW_OPCODE_MUL:
3846	 brw_MUL(p, dst, src[0], src[1]);
3847	 break;
3848
3849      case BRW_OPCODE_FRC:
3850	 brw_FRC(p, dst, src[0]);
3851	 break;
3852      case BRW_OPCODE_RNDD:
3853	 brw_RNDD(p, dst, src[0]);
3854	 break;
3855      case BRW_OPCODE_RNDE:
3856	 brw_RNDE(p, dst, src[0]);
3857	 break;
3858      case BRW_OPCODE_RNDZ:
3859	 brw_RNDZ(p, dst, src[0]);
3860	 break;
3861
3862      case BRW_OPCODE_AND:
3863	 brw_AND(p, dst, src[0], src[1]);
3864	 break;
3865      case BRW_OPCODE_OR:
3866	 brw_OR(p, dst, src[0], src[1]);
3867	 break;
3868      case BRW_OPCODE_XOR:
3869	 brw_XOR(p, dst, src[0], src[1]);
3870	 break;
3871      case BRW_OPCODE_NOT:
3872	 brw_NOT(p, dst, src[0]);
3873	 break;
3874      case BRW_OPCODE_ASR:
3875	 brw_ASR(p, dst, src[0], src[1]);
3876	 break;
3877      case BRW_OPCODE_SHR:
3878	 brw_SHR(p, dst, src[0], src[1]);
3879	 break;
3880      case BRW_OPCODE_SHL:
3881	 brw_SHL(p, dst, src[0], src[1]);
3882	 break;
3883
3884      case BRW_OPCODE_CMP:
3885	 brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
3886	 break;
3887      case BRW_OPCODE_SEL:
3888	 brw_SEL(p, dst, src[0], src[1]);
3889	 break;
3890
3891      case BRW_OPCODE_IF:
3892	 if (inst->src[0].file != BAD_FILE) {
3893	    assert(intel->gen >= 6);
3894	    gen6_IF(p, inst->conditional_mod, src[0], src[1]);
3895	 } else {
3896	    brw_IF(p, BRW_EXECUTE_8);
3897	 }
3898	 if_depth_in_loop[loop_stack_depth]++;
3899	 break;
3900
3901      case BRW_OPCODE_ELSE:
3902	 brw_ELSE(p);
3903	 break;
3904      case BRW_OPCODE_ENDIF:
3905	 brw_ENDIF(p);
3906	 if_depth_in_loop[loop_stack_depth]--;
3907	 break;
3908
3909      case BRW_OPCODE_DO:
3910	 loop_stack[loop_stack_depth++] = brw_DO(p, BRW_EXECUTE_8);
3911	 if (loop_stack_array_size <= loop_stack_depth) {
3912	    loop_stack_array_size *= 2;
3913	    loop_stack = reralloc(this->mem_ctx, loop_stack, brw_instruction *,
3914				  loop_stack_array_size);
3915	    if_depth_in_loop = reralloc(this->mem_ctx, if_depth_in_loop, int,
3916				        loop_stack_array_size);
3917	 }
3918	 if_depth_in_loop[loop_stack_depth] = 0;
3919	 break;
3920
3921      case BRW_OPCODE_BREAK:
3922	 brw_BREAK(p, if_depth_in_loop[loop_stack_depth]);
3923	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
3924	 break;
3925      case BRW_OPCODE_CONTINUE:
3926	 /* FINISHME: We need to write the loop instruction support still. */
3927	 if (intel->gen >= 6)
3928	    gen6_CONT(p, loop_stack[loop_stack_depth - 1]);
3929	 else
3930	    brw_CONT(p, if_depth_in_loop[loop_stack_depth]);
3931	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
3932	 break;
3933
3934      case BRW_OPCODE_WHILE: {
3935	 struct brw_instruction *inst0, *inst1;
3936	 GLuint br = 1;
3937
3938	 if (intel->gen >= 5)
3939	    br = 2;
3940
3941	 assert(loop_stack_depth > 0);
3942	 loop_stack_depth--;
3943	 inst0 = inst1 = brw_WHILE(p, loop_stack[loop_stack_depth]);
3944	 if (intel->gen < 6) {
3945	    /* patch all the BREAK/CONT instructions from last BGNLOOP */
3946	    while (inst0 > loop_stack[loop_stack_depth]) {
3947	       inst0--;
3948	       if (inst0->header.opcode == BRW_OPCODE_BREAK &&
3949		   inst0->bits3.if_else.jump_count == 0) {
3950		  inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
3951	    }
3952	       else if (inst0->header.opcode == BRW_OPCODE_CONTINUE &&
3953			inst0->bits3.if_else.jump_count == 0) {
3954		  inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
3955	       }
3956	    }
3957	 }
3958      }
3959	 break;
3960
3961      case FS_OPCODE_RCP:
3962      case FS_OPCODE_RSQ:
3963      case FS_OPCODE_SQRT:
3964      case FS_OPCODE_EXP2:
3965      case FS_OPCODE_LOG2:
3966      case FS_OPCODE_POW:
3967      case FS_OPCODE_SIN:
3968      case FS_OPCODE_COS:
3969	 generate_math(inst, dst, src);
3970	 break;
3971      case FS_OPCODE_PIXEL_X:
3972	 generate_pixel_xy(dst, true);
3973	 break;
3974      case FS_OPCODE_PIXEL_Y:
3975	 generate_pixel_xy(dst, false);
3976	 break;
3977      case FS_OPCODE_CINTERP:
3978	 brw_MOV(p, dst, src[0]);
3979	 break;
3980      case FS_OPCODE_LINTERP:
3981	 generate_linterp(inst, dst, src);
3982	 break;
3983      case FS_OPCODE_TEX:
3984      case FS_OPCODE_TXB:
3985      case FS_OPCODE_TXD:
3986      case FS_OPCODE_TXL:
3987	 generate_tex(inst, dst, src[0]);
3988	 break;
3989      case FS_OPCODE_DISCARD_NOT:
3990	 generate_discard_not(inst, dst);
3991	 break;
3992      case FS_OPCODE_DISCARD_AND:
3993	 generate_discard_and(inst, src[0]);
3994	 break;
3995      case FS_OPCODE_DDX:
3996	 generate_ddx(inst, dst, src[0]);
3997	 break;
3998      case FS_OPCODE_DDY:
3999	 generate_ddy(inst, dst, src[0]);
4000	 break;
4001
4002      case FS_OPCODE_SPILL:
4003	 generate_spill(inst, src[0]);
4004	 break;
4005
4006      case FS_OPCODE_UNSPILL:
4007	 generate_unspill(inst, dst);
4008	 break;
4009
4010      case FS_OPCODE_PULL_CONSTANT_LOAD:
4011	 generate_pull_constant_load(inst, dst);
4012	 break;
4013
4014      case FS_OPCODE_FB_WRITE:
4015	 generate_fb_write(inst);
4016	 break;
4017      default:
4018	 if (inst->opcode < (int)ARRAY_SIZE(brw_opcodes)) {
4019	    _mesa_problem(ctx, "Unsupported opcode `%s' in FS",
4020			  brw_opcodes[inst->opcode].name);
4021	 } else {
4022	    _mesa_problem(ctx, "Unsupported opcode %d in FS", inst->opcode);
4023	 }
4024	 fail("unsupported opcode in FS\n");
4025      }
4026
4027      if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
4028	 for (unsigned int i = last_native_inst; i < p->nr_insn; i++) {
4029	    if (0) {
4030	       printf("0x%08x 0x%08x 0x%08x 0x%08x ",
4031		      ((uint32_t *)&p->store[i])[3],
4032		      ((uint32_t *)&p->store[i])[2],
4033		      ((uint32_t *)&p->store[i])[1],
4034		      ((uint32_t *)&p->store[i])[0]);
4035	    }
4036	    brw_disasm(stdout, &p->store[i], intel->gen);
4037	 }
4038      }
4039
4040      last_native_inst = p->nr_insn;
4041   }
4042
4043   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
4044      printf("\n");
4045   }
4046
4047   ralloc_free(loop_stack);
4048   ralloc_free(if_depth_in_loop);
4049
4050   brw_set_uip_jip(p);
4051
4052   /* OK, while the INTEL_DEBUG=wm above is very nice for debugging FS
4053    * emit issues, it doesn't get the jump distances into the output,
4054    * which is often something we want to debug.  So this is here in
4055    * case you're doing that.
4056    */
4057   if (0) {
4058      if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
4059	 for (unsigned int i = 0; i < p->nr_insn; i++) {
4060	    printf("0x%08x 0x%08x 0x%08x 0x%08x ",
4061		   ((uint32_t *)&p->store[i])[3],
4062		   ((uint32_t *)&p->store[i])[2],
4063		   ((uint32_t *)&p->store[i])[1],
4064		   ((uint32_t *)&p->store[i])[0]);
4065	    brw_disasm(stdout, &p->store[i], intel->gen);
4066	 }
4067      }
4068   }
4069}
4070
4071bool
4072fs_visitor::run()
4073{
4074   uint32_t prog_offset_16 = 0;
4075   uint32_t orig_nr_params = c->prog_data.nr_params;
4076
4077   brw_wm_payload_setup(brw, c);
4078
4079   if (c->dispatch_width == 16) {
4080      /* align to 64 byte boundary. */
4081      while ((c->func.nr_insn * sizeof(struct brw_instruction)) % 64) {
4082	 brw_NOP(p);
4083      }
4084
4085      /* Save off the start of this 16-wide program in case we succeed. */
4086      prog_offset_16 = c->func.nr_insn * sizeof(struct brw_instruction);
4087
4088      brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
4089   }
4090
4091   if (0) {
4092      emit_dummy_fs();
4093   } else {
4094      calculate_urb_setup();
4095      if (intel->gen < 6)
4096	 emit_interpolation_setup_gen4();
4097      else
4098	 emit_interpolation_setup_gen6();
4099
4100      /* Generate FS IR for main().  (the visitor only descends into
4101       * functions called "main").
4102       */
4103      foreach_iter(exec_list_iterator, iter, *shader->ir) {
4104	 ir_instruction *ir = (ir_instruction *)iter.get();
4105	 base_ir = ir;
4106	 ir->accept(this);
4107      }
4108
4109      emit_fb_writes();
4110
4111      split_virtual_grfs();
4112
4113      setup_paramvalues_refs();
4114      setup_pull_constants();
4115
4116      bool progress;
4117      do {
4118	 progress = false;
4119
4120	 progress = remove_duplicate_mrf_writes() || progress;
4121
4122	 progress = propagate_constants() || progress;
4123	 progress = register_coalesce() || progress;
4124	 progress = compute_to_mrf() || progress;
4125	 progress = dead_code_eliminate() || progress;
4126      } while (progress);
4127
4128      schedule_instructions();
4129
4130      assign_curb_setup();
4131      assign_urb_setup();
4132
4133      if (0) {
4134	 /* Debug of register spilling: Go spill everything. */
4135	 int virtual_grf_count = virtual_grf_next;
4136	 for (int i = 1; i < virtual_grf_count; i++) {
4137	    spill_reg(i);
4138	 }
4139      }
4140
4141      if (0)
4142	 assign_regs_trivial();
4143      else {
4144	 while (!assign_regs()) {
4145	    if (failed)
4146	       break;
4147	 }
4148      }
4149   }
4150   assert(force_uncompressed_stack == 0);
4151   assert(force_sechalf_stack == 0);
4152
4153   if (failed)
4154      return false;
4155
4156   generate_code();
4157
4158   if (c->dispatch_width == 8) {
4159      c->prog_data.total_grf = grf_used;
4160   } else {
4161      c->prog_data.total_grf_16 = grf_used;
4162      c->prog_data.prog_offset_16 = prog_offset_16;
4163
4164      /* Make sure we didn't try to sneak in an extra uniform */
4165      assert(orig_nr_params == c->prog_data.nr_params);
4166   }
4167
4168   return !failed;
4169}
4170
4171bool
4172brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c)
4173{
4174   struct intel_context *intel = &brw->intel;
4175   struct gl_context *ctx = &intel->ctx;
4176   struct gl_shader_program *prog = ctx->Shader.CurrentFragmentProgram;
4177
4178   if (!prog)
4179      return false;
4180
4181   struct brw_shader *shader =
4182     (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
4183   if (!shader)
4184      return false;
4185
4186   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
4187      printf("GLSL IR for native fragment shader %d:\n", prog->Name);
4188      _mesa_print_ir(shader->ir, NULL);
4189      printf("\n\n");
4190   }
4191
4192   /* Now the main event: Visit the shader IR and generate our FS IR for it.
4193    */
4194   c->dispatch_width = 8;
4195
4196   fs_visitor v(c, shader);
4197   if (!v.run()) {
4198      /* FINISHME: Cleanly fail, test at link time, etc. */
4199      assert(!"not reached");
4200      return false;
4201   }
4202
4203   if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0) {
4204      c->dispatch_width = 16;
4205      fs_visitor v2(c, shader);
4206      v2.import_uniforms(v.variable_ht);
4207      v2.run();
4208   }
4209
4210   c->prog_data.dispatch_width = 8;
4211
4212   return true;
4213}
4214