brw_fs.cpp revision 2279156fe7ac9718533b8b0de90ae96100486680
/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Eric Anholt <eric@anholt.net>
 *
 */

extern "C" {

#include <sys/types.h>

#include "main/macros.h"
#include "main/shaderobj.h"
#include "main/uniforms.h"
#include "program/prog_parameter.h"
#include "program/prog_print.h"
#include "program/prog_optimize.h"
#include "program/register_allocate.h"
#include "program/sampler.h"
#include "program/hash_table.h"
#include "brw_context.h"
#include "brw_eu.h"
#include "brw_wm.h"
}
#include "brw_fs.h"
#include "../glsl/glsl_types.h"
#include "../glsl/ir_optimization.h"
#include "../glsl/ir_print_visitor.h"

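/* An instruction-index sentinel larger than any real instruction count;
 * the live-interval analysis later in this backend uses it as the initial
 * "not seen yet" bound.
 */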
#define MAX_INSTRUCTION (1 << 30)
static struct brw_reg brw_reg_from_fs_reg(class fs_reg *reg);

struct gl_shader *
brw_new_shader(struct gl_context *ctx, GLuint name, GLuint type)
{
   struct brw_shader *shader;

   shader = rzalloc(NULL, struct brw_shader);
   if (shader) {
      shader->base.Type = type;
      shader->base.Name = name;
      _mesa_init_shader(ctx, &shader->base);
   }

   return &shader->base;
}

struct gl_shader_program *
brw_new_shader_program(struct gl_context *ctx, GLuint name)
{
   struct brw_shader_program *prog;
   prog = rzalloc(NULL, struct brw_shader_program);
   if (prog) {
      prog->base.Name = name;
      _mesa_init_shader_program(ctx, &prog->base);
   }
   return &prog->base;
}

GLboolean
brw_compile_shader(struct gl_context *ctx, struct gl_shader *shader)
{
   if (!_mesa_ir_compile_shader(ctx, shader))
      return GL_FALSE;

   return GL_TRUE;
}

GLboolean
brw_link_shader(struct gl_context *ctx, struct gl_shader_program *prog)
{
   struct brw_context *brw = brw_context(ctx);
   struct intel_context *intel = &brw->intel;

   struct brw_shader *shader =
      (struct brw_shader *)prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
   if (shader != NULL) {
      void *mem_ctx = ralloc_context(NULL);
      bool progress;

      if (shader->ir)
	 ralloc_free(shader->ir);
      shader->ir = new(shader) exec_list;
      clone_ir_list(mem_ctx, shader->ir, shader->base.ir);

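      /* The passes below run on this private clone of the linked IR, so
       * the core Mesa copy stays untouched; the clone is reparented onto
       * the shader and the scratch ralloc context freed once they finish.
       * They rewrite operations this scalar backend has no native opcode
       * for: matrix ops become vector ops, mod becomes b * fract(a/b),
       * division becomes multiplication by a reciprocal, subtraction
       * becomes addition of a negation, and exp/log take their base-2
       * forms.
       */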
      do_mat_op_to_vec(shader->ir);
      lower_instructions(shader->ir,
			 MOD_TO_FRACT |
			 DIV_TO_MUL_RCP |
			 SUB_TO_ADD_NEG |
			 EXP_TO_EXP2 |
			 LOG_TO_LOG2);

      /* Pre-gen6 HW can only nest if-statements 16 deep.  Beyond this,
       * if-statements need to be flattened.
       */
      if (intel->gen < 6)
	 lower_if_to_cond_assign(shader->ir, 16);

      do_lower_texture_projection(shader->ir);
      do_vec_index_to_cond_assign(shader->ir);
      brw_do_cubemap_normalize(shader->ir);
      lower_noise(shader->ir);
      lower_quadop_vector(shader->ir, false);
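      /* This backend has no support yet for register-indirect addressing
       * of GRFs, so any variable indexing (including into uniforms) is
       * turned into chains of conditional assignments here.
       */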
      lower_variable_index_to_cond_assign(shader->ir,
					  GL_TRUE, /* input */
					  GL_TRUE, /* output */
					  GL_TRUE, /* temp */
					  GL_TRUE /* uniform */
					  );

      do {
	 progress = false;

	 brw_do_channel_expressions(shader->ir);
	 brw_do_vector_splitting(shader->ir);

	 progress = do_lower_jumps(shader->ir, true, true,
				   true, /* main return */
				   false, /* continue */
				   false /* loops */
				   ) || progress;

	 progress = do_common_optimization(shader->ir, true, 32) || progress;
      } while (progress);

      validate_ir_tree(shader->ir);

      reparent_ir(shader->ir, shader->ir);
      ralloc_free(mem_ctx);
   }

   if (!_mesa_ir_link_shader(ctx, prog))
      return GL_FALSE;

   return GL_TRUE;
}

static int
type_size(const struct glsl_type *type)
{
   unsigned int size, i;

   switch (type->base_type) {
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_BOOL:
      return type->components();
   case GLSL_TYPE_ARRAY:
      return type_size(type->fields.array) * type->length;
   case GLSL_TYPE_STRUCT:
      size = 0;
      for (i = 0; i < type->length; i++) {
	 size += type_size(type->fields.structure[i].type);
      }
      return size;
   case GLSL_TYPE_SAMPLER:
      /* Samplers take up no register space, since they're baked in at
       * link time.
       */
      return 0;
   default:
      assert(!"not reached");
      return 0;
   }
}

/**
 * Returns how many MRFs an FS opcode will write over.
 *
 * Note that this is distinct from the 0 or 1 implicit MRF writes of the
 * actual hardware instruction -- these virtual FS opcodes often emit
 * additional MOVs to build up their message payloads.
 */
int
fs_visitor::implied_mrf_writes(fs_inst *inst)
{
   if (inst->mlen == 0)
      return 0;

   switch (inst->opcode) {
   case FS_OPCODE_RCP:
   case FS_OPCODE_RSQ:
   case FS_OPCODE_SQRT:
   case FS_OPCODE_EXP2:
   case FS_OPCODE_LOG2:
   case FS_OPCODE_SIN:
   case FS_OPCODE_COS:
      return 1;
   case FS_OPCODE_POW:
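      /* POW's message carries both operands, one MRF each. */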
      return 2;
   case FS_OPCODE_TEX:
   case FS_OPCODE_TXB:
   case FS_OPCODE_TXD:
   case FS_OPCODE_TXL:
      return 1;
   case FS_OPCODE_FB_WRITE:
      return 2;
   case FS_OPCODE_PULL_CONSTANT_LOAD:
   case FS_OPCODE_UNSPILL:
      return 1;
   case FS_OPCODE_SPILL:
      return 2;
   default:
      assert(!"not reached");
      return inst->mlen;
   }
}

int
fs_visitor::virtual_grf_alloc(int size)
{
   if (virtual_grf_array_size <= virtual_grf_next) {
      if (virtual_grf_array_size == 0)
	 virtual_grf_array_size = 16;
      else
	 virtual_grf_array_size *= 2;
      virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
				   virtual_grf_array_size);

      /* This slot is always unused. */
      virtual_grf_sizes[0] = 0;
   }
   virtual_grf_sizes[virtual_grf_next] = size;
   return virtual_grf_next++;
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int hw_reg)
{
   init();
   this->file = file;
   this->hw_reg = hw_reg;
   this->type = BRW_REGISTER_TYPE_F;
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int hw_reg, uint32_t type)
{
   init();
   this->file = file;
   this->hw_reg = hw_reg;
   this->type = type;
}

int
brw_type_for_base_type(const struct glsl_type *type)
{
   switch (type->base_type) {
   case GLSL_TYPE_FLOAT:
      return BRW_REGISTER_TYPE_F;
   case GLSL_TYPE_INT:
   case GLSL_TYPE_BOOL:
      return BRW_REGISTER_TYPE_D;
   case GLSL_TYPE_UINT:
      return BRW_REGISTER_TYPE_UD;
   case GLSL_TYPE_ARRAY:
   case GLSL_TYPE_STRUCT:
   case GLSL_TYPE_SAMPLER:
      /* These should be overridden with the type of the member when
       * dereferenced into.  BRW_REGISTER_TYPE_UD seems like a likely
       * way to trip up if we don't.
       */
      return BRW_REGISTER_TYPE_UD;
   default:
      assert(!"not reached");
      return BRW_REGISTER_TYPE_F;
   }
}

/** Automatic reg constructor. */
fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
{
   init();

   this->file = GRF;
   this->reg = v->virtual_grf_alloc(type_size(type));
   this->reg_offset = 0;
   this->type = brw_type_for_base_type(type);
}

fs_reg *
fs_visitor::variable_storage(ir_variable *var)
{
   return (fs_reg *)hash_table_find(this->variable_ht, var);
}

/* Our support for uniforms is piggy-backed on the struct
 * gl_fragment_program, because that's where the values actually
 * get stored, rather than in some global gl_shader_program uniform
 * store.
 */
int
fs_visitor::setup_uniform_values(int loc, const glsl_type *type)
{
   unsigned int offset = 0;

   if (type->is_matrix()) {
      const glsl_type *column = glsl_type::get_instance(GLSL_TYPE_FLOAT,
							type->vector_elements,
							1);

      for (unsigned int i = 0; i < type->matrix_columns; i++) {
	 offset += setup_uniform_values(loc + offset, column);
      }

      return offset;
   }

   switch (type->base_type) {
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_BOOL:
      for (unsigned int i = 0; i < type->vector_elements; i++) {
	 unsigned int param = c->prog_data.nr_params++;

	 assert(param < ARRAY_SIZE(c->prog_data.param));

	 switch (type->base_type) {
	 case GLSL_TYPE_FLOAT:
	    c->prog_data.param_convert[param] = PARAM_NO_CONVERT;
	    break;
	 case GLSL_TYPE_UINT:
	    c->prog_data.param_convert[param] = PARAM_CONVERT_F2U;
	    break;
	 case GLSL_TYPE_INT:
	    c->prog_data.param_convert[param] = PARAM_CONVERT_F2I;
	    break;
	 case GLSL_TYPE_BOOL:
	    c->prog_data.param_convert[param] = PARAM_CONVERT_F2B;
	    break;
	 default:
	    assert(!"not reached");
	    c->prog_data.param_convert[param] = PARAM_NO_CONVERT;
	    break;
	 }
	 this->param_index[param] = loc;
	 this->param_offset[param] = i;
      }
      return 1;

   case GLSL_TYPE_STRUCT:
      for (unsigned int i = 0; i < type->length; i++) {
	 offset += setup_uniform_values(loc + offset,
					type->fields.structure[i].type);
      }
      return offset;

   case GLSL_TYPE_ARRAY:
      for (unsigned int i = 0; i < type->length; i++) {
	 offset += setup_uniform_values(loc + offset, type->fields.array);
      }
      return offset;

   case GLSL_TYPE_SAMPLER:
      /* The sampler takes up a slot, but we don't use any values from it. */
      return 1;

   default:
      assert(!"not reached");
      return 0;
   }
}


/* Our support for builtin uniforms is even scarier than non-builtin.
 * It sits on top of the PROG_STATE_VAR parameters that are
 * automatically updated from GL context state.
 */
void
fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
{
   const struct gl_builtin_uniform_desc *statevar = NULL;

   for (unsigned int i = 0; _mesa_builtin_uniform_desc[i].name; i++) {
      if (strcmp(ir->name, _mesa_builtin_uniform_desc[i].name) == 0) {
	 statevar = &_mesa_builtin_uniform_desc[i];
	 break;
      }
   }

   if (!statevar) {
      this->fail = true;
      printf("Failed to find builtin uniform `%s'\n", ir->name);
      return;
   }

   int array_count;
   if (ir->type->is_array()) {
      array_count = ir->type->length;
   } else {
      array_count = 1;
   }

   for (int a = 0; a < array_count; a++) {
      for (unsigned int i = 0; i < statevar->num_elements; i++) {
	 struct gl_builtin_uniform_element *element = &statevar->elements[i];
	 int tokens[STATE_LENGTH];

	 memcpy(tokens, element->tokens, sizeof(element->tokens));
	 if (ir->type->is_array()) {
	    tokens[1] = a;
	 }

	 /* This state reference has already been setup by ir_to_mesa,
	  * but we'll get the same index back here.
	  */
	 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
					       (gl_state_index *)tokens);

	 /* Add each of the unique swizzles of the element as a
	  * parameter.  This'll end up matching the expected layout of
	  * the array/matrix/structure we're trying to fill in.
	  */
	 int last_swiz = -1;
	 for (unsigned int i = 0; i < 4; i++) {
	    int swiz = GET_SWZ(element->swizzle, i);
	    if (swiz == last_swiz)
	       break;
	    last_swiz = swiz;

	    c->prog_data.param_convert[c->prog_data.nr_params] =
	       PARAM_NO_CONVERT;
	    this->param_index[c->prog_data.nr_params] = index;
	    this->param_offset[c->prog_data.nr_params] = swiz;
	    c->prog_data.nr_params++;
	 }
      }
   }
}

fs_reg *
fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   fs_reg wpos = *reg;
   fs_reg neg_y = this->pixel_y;
   neg_y.negate = true;
   bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;
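   /* gl_FragCoord.y needs inverting whenever the shader's declared origin
    * (lower-left by default, upper-left with a layout qualifier) disagrees
    * with the buffer's actual orientation; rendering to an FBO is already
    * y-flipped relative to a window-system drawable, which is why
    * render_to_fbo inverts the condition.
    */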

   /* gl_FragCoord.x */
   if (ir->pixel_center_integer) {
      emit(fs_inst(BRW_OPCODE_MOV, wpos, this->pixel_x));
   } else {
      emit(fs_inst(BRW_OPCODE_ADD, wpos, this->pixel_x, fs_reg(0.5f)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.y */
   if (!flip && ir->pixel_center_integer) {
      emit(fs_inst(BRW_OPCODE_MOV, wpos, this->pixel_y));
   } else {
      fs_reg pixel_y = this->pixel_y;
      float offset = (ir->pixel_center_integer ? 0.0 : 0.5);

      if (flip) {
	 pixel_y.negate = true;
	 offset += c->key.drawable_height - 1.0;
      }

      emit(fs_inst(BRW_OPCODE_ADD, wpos, pixel_y, fs_reg(offset)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.z */
   if (intel->gen >= 6) {
      emit(fs_inst(BRW_OPCODE_MOV, wpos,
		   fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
   } else {
      emit(fs_inst(FS_OPCODE_LINTERP, wpos, this->delta_x, this->delta_y,
		   interp_reg(FRAG_ATTRIB_WPOS, 2)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.w: Already set up in emit_interpolation */
   emit(fs_inst(BRW_OPCODE_MOV, wpos, this->wpos_w));

   return reg;
}

fs_reg *
fs_visitor::emit_general_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   /* Interpolation is always in floating point regs. */
   reg->type = BRW_REGISTER_TYPE_F;
   fs_reg attr = *reg;

   unsigned int array_elements;
   const glsl_type *type;

   if (ir->type->is_array()) {
      array_elements = ir->type->length;
      if (array_elements == 0) {
	 this->fail = true;
      }
      type = ir->type->fields.array;
   } else {
      array_elements = 1;
      type = ir->type;
   }

   int location = ir->location;
   for (unsigned int i = 0; i < array_elements; i++) {
      for (unsigned int j = 0; j < type->matrix_columns; j++) {
	 if (urb_setup[location] == -1) {
	    /* If there's no incoming setup data for this slot, don't
	     * emit interpolation for it.
	     */
	    attr.reg_offset += type->vector_elements;
	    location++;
	    continue;
	 }

	 if (c->key.flat_shade && (location == FRAG_ATTRIB_COL0 ||
				   location == FRAG_ATTRIB_COL1)) {
	    /* Constant interpolation (flat shading) case. The SF has
	     * handed us defined values in only the constant offset
	     * field of the setup reg.
	     */
	    for (unsigned int c = 0; c < type->vector_elements; c++) {
	       struct brw_reg interp = interp_reg(location, c);
	       interp = suboffset(interp, 3);
	       emit(fs_inst(FS_OPCODE_CINTERP, attr, fs_reg(interp)));
	       attr.reg_offset++;
	    }
	 } else {
	    /* Perspective interpolation case. */
	    for (unsigned int c = 0; c < type->vector_elements; c++) {
	       struct brw_reg interp = interp_reg(location, c);
	       emit(fs_inst(FS_OPCODE_LINTERP,
			    attr,
			    this->delta_x,
			    this->delta_y,
			    fs_reg(interp)));
	       attr.reg_offset++;
	    }

	    if (intel->gen < 6) {
	       attr.reg_offset -= type->vector_elements;
	       for (unsigned int c = 0; c < type->vector_elements; c++) {
		  emit(fs_inst(BRW_OPCODE_MUL,
			       attr,
			       attr,
			       this->pixel_w));
		  attr.reg_offset++;
	       }
	    }
	 }
	 location++;
      }
   }

   return reg;
}

fs_reg *
fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);

   /* The frontfacing comes in as a bit in the thread payload. */
   if (intel->gen >= 6) {
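      /* On gen6, bit 15 of g0.0 is the "primitive is back-facing" flag.
       * The ASR by 15 brings that bit down to bit 0, the NOT turns
       * back-facing into front-facing, and the AND masks off everything
       * but that one bit, leaving a clean 0 or 1.
       */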
      emit(fs_inst(BRW_OPCODE_ASR,
		   *reg,
		   fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
		   fs_reg(15)));
      emit(fs_inst(BRW_OPCODE_NOT,
		   *reg,
		   *reg));
      emit(fs_inst(BRW_OPCODE_AND,
		   *reg,
		   *reg,
		   fs_reg(1)));
   } else {
      struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
      /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
       * us front face
       */
      fs_inst *inst = emit(fs_inst(BRW_OPCODE_CMP,
				   *reg,
				   fs_reg(r1_6ud),
				   fs_reg(1u << 31)));
      inst->conditional_mod = BRW_CONDITIONAL_L;
      emit(fs_inst(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u)));
   }

   return reg;
}

fs_inst *
fs_visitor::emit_math(fs_opcodes opcode, fs_reg dst, fs_reg src)
{
   switch (opcode) {
   case FS_OPCODE_RCP:
   case FS_OPCODE_RSQ:
   case FS_OPCODE_SQRT:
   case FS_OPCODE_EXP2:
   case FS_OPCODE_LOG2:
   case FS_OPCODE_SIN:
   case FS_OPCODE_COS:
      break;
   default:
      assert(!"not reached: bad math opcode");
      return NULL;
   }

   /* Can't do hstride == 0 args to gen6 math, so expand it out.  We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * The hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (intel->gen >= 6 && (src.file == UNIFORM ||
			   src.abs ||
			   src.negate)) {
      fs_reg expanded = fs_reg(this, glsl_type::float_type);
      emit(fs_inst(BRW_OPCODE_MOV, expanded, src));
      src = expanded;
   }

   fs_inst *inst = emit(fs_inst(opcode, dst, src));

   if (intel->gen < 6) {
      inst->base_mrf = 2;
      inst->mlen = 1;
   }

   return inst;
}

fs_inst *
fs_visitor::emit_math(fs_opcodes opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   int base_mrf = 2;
   fs_inst *inst;

   assert(opcode == FS_OPCODE_POW);

   if (intel->gen >= 6) {
      /* Can't do hstride == 0 args to gen6 math, so expand it out.
       *
       * The hardware ignores source modifiers (negate and abs) on math
       * instructions, so we also move to a temp to set those up.
       */
      if (src0.file == UNIFORM || src0.abs || src0.negate) {
	 fs_reg expanded = fs_reg(this, glsl_type::float_type);
	 emit(fs_inst(BRW_OPCODE_MOV, expanded, src0));
	 src0 = expanded;
      }

      if (src1.file == UNIFORM || src1.abs || src1.negate) {
	 fs_reg expanded = fs_reg(this, glsl_type::float_type);
	 emit(fs_inst(BRW_OPCODE_MOV, expanded, src1));
	 src1 = expanded;
      }

      inst = emit(fs_inst(opcode, dst, src0, src1));
   } else {
      emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1), src1));
      inst = emit(fs_inst(opcode, dst, src0, reg_null_f));

      inst->base_mrf = base_mrf;
      inst->mlen = 2;
   }
   return inst;
}

void
fs_visitor::visit(ir_variable *ir)
{
   fs_reg *reg = NULL;

   if (variable_storage(ir))
      return;

   if (strcmp(ir->name, "gl_FragColor") == 0) {
      this->frag_color = ir;
   } else if (strcmp(ir->name, "gl_FragData") == 0) {
      this->frag_data = ir;
   } else if (strcmp(ir->name, "gl_FragDepth") == 0) {
      this->frag_depth = ir;
   }

   if (ir->mode == ir_var_in) {
      if (!strcmp(ir->name, "gl_FragCoord")) {
	 reg = emit_fragcoord_interpolation(ir);
      } else if (!strcmp(ir->name, "gl_FrontFacing")) {
	 reg = emit_frontfacing_interpolation(ir);
      } else {
	 reg = emit_general_interpolation(ir);
      }
      assert(reg);
      hash_table_insert(this->variable_ht, reg, ir);
      return;
   }

   if (ir->mode == ir_var_uniform) {
      int param_index = c->prog_data.nr_params;

      if (!strncmp(ir->name, "gl_", 3)) {
	 setup_builtin_uniform_values(ir);
      } else {
	 setup_uniform_values(ir->location, ir->type);
      }

      reg = new(this->mem_ctx) fs_reg(UNIFORM, param_index);
      reg->type = brw_type_for_base_type(ir->type);
   }

   if (!reg)
      reg = new(this->mem_ctx) fs_reg(this, ir->type);

   hash_table_insert(this->variable_ht, reg, ir);
}

void
fs_visitor::visit(ir_dereference_variable *ir)
{
   fs_reg *reg = variable_storage(ir->var);
   this->result = *reg;
}

void
fs_visitor::visit(ir_dereference_record *ir)
{
   const glsl_type *struct_type = ir->record->type;

   ir->record->accept(this);

   unsigned int offset = 0;
   for (unsigned int i = 0; i < struct_type->length; i++) {
      if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
	 break;
      offset += type_size(struct_type->fields.structure[i].type);
   }
   this->result.reg_offset += offset;
   this->result.type = brw_type_for_base_type(ir->type);
}

void
fs_visitor::visit(ir_dereference_array *ir)
{
   ir_constant *index;
   int element_size;

   ir->array->accept(this);
   index = ir->array_index->as_constant();

   element_size = type_size(ir->type);
   this->result.type = brw_type_for_base_type(ir->type);

   if (index) {
      assert(this->result.file == UNIFORM ||
	     (this->result.file == GRF &&
	      this->result.reg != 0));
      this->result.reg_offset += index->value.i[0] * element_size;
   } else {
      assert(!"FINISHME: non-constant array element");
   }
}

/* Instruction selection: Produce a MOV.sat instead of
 * MIN(MAX(val, 0), 1) when possible.
 */
bool
fs_visitor::try_emit_saturate(ir_expression *ir)
{
   ir_rvalue *sat_val = ir->as_rvalue_to_saturate();

   if (!sat_val)
      return false;

   sat_val->accept(this);
   fs_reg src = this->result;

   this->result = fs_reg(this, ir->type);
   fs_inst *inst = emit(fs_inst(BRW_OPCODE_MOV, this->result, src));
   inst->saturate = true;

   return true;
}

static uint32_t
brw_conditional_for_comparison(unsigned int op)
{
   switch (op) {
   case ir_binop_less:
      return BRW_CONDITIONAL_L;
   case ir_binop_greater:
      return BRW_CONDITIONAL_G;
   case ir_binop_lequal:
      return BRW_CONDITIONAL_LE;
   case ir_binop_gequal:
      return BRW_CONDITIONAL_GE;
   case ir_binop_equal:
   case ir_binop_all_equal: /* same as equal for scalars */
      return BRW_CONDITIONAL_Z;
   case ir_binop_nequal:
   case ir_binop_any_nequal: /* same as nequal for scalars */
      return BRW_CONDITIONAL_NZ;
   default:
      assert(!"not reached: bad operation for comparison");
      return BRW_CONDITIONAL_NZ;
   }
}

void
fs_visitor::visit(ir_expression *ir)
{
   unsigned int operand;
   fs_reg op[2], temp;
   fs_inst *inst;

   assert(ir->get_num_operands() <= 2);

   if (try_emit_saturate(ir))
      return;

   for (operand = 0; operand < ir->get_num_operands(); operand++) {
      ir->operands[operand]->accept(this);
      if (this->result.file == BAD_FILE) {
	 ir_print_visitor v;
	 printf("Failed to get tree for expression operand:\n");
	 ir->operands[operand]->accept(&v);
	 this->fail = true;
      }
      op[operand] = this->result;

      /* Matrix expression operands should have been broken down to vector
       * operations already.
       */
      assert(!ir->operands[operand]->type->is_matrix());
      /* And then those vector operands should have been broken down to scalar.
       */
      assert(!ir->operands[operand]->type->is_vector());
   }

   /* Storage for our result.  If our result goes into an assignment, it will
    * just get copy-propagated out, so no worries.
    */
   this->result = fs_reg(this, ir->type);

   switch (ir->operation) {
   case ir_unop_logic_not:
      /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
       * ones complement of the whole register, not just bit 0.
       */
      emit(fs_inst(BRW_OPCODE_XOR, this->result, op[0], fs_reg(1)));
      break;
   case ir_unop_neg:
      op[0].negate = !op[0].negate;
      this->result = op[0];
      break;
   case ir_unop_abs:
      op[0].abs = true;
      op[0].negate = false;
      this->result = op[0];
      break;
   case ir_unop_sign:
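      /* sign(x) from predicated moves: initialize the result to 0.0,
       * then compare the operand against zero twice, conditionally
       * overwriting with +1.0 and -1.0.
       */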
      temp = fs_reg(this, ir->type);

      emit(fs_inst(BRW_OPCODE_MOV, this->result, fs_reg(0.0f)));

      inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null_f, op[0], fs_reg(0.0f)));
      inst->conditional_mod = BRW_CONDITIONAL_G;
      inst = emit(fs_inst(BRW_OPCODE_MOV, this->result, fs_reg(1.0f)));
      inst->predicated = true;

      inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null_f, op[0], fs_reg(0.0f)));
      inst->conditional_mod = BRW_CONDITIONAL_L;
      inst = emit(fs_inst(BRW_OPCODE_MOV, this->result, fs_reg(-1.0f)));
      inst->predicated = true;

      break;
   case ir_unop_rcp:
      emit_math(FS_OPCODE_RCP, this->result, op[0]);
      break;

   case ir_unop_exp2:
      emit_math(FS_OPCODE_EXP2, this->result, op[0]);
      break;
   case ir_unop_log2:
      emit_math(FS_OPCODE_LOG2, this->result, op[0]);
      break;
   case ir_unop_exp:
   case ir_unop_log:
      assert(!"not reached: should be handled by ir_explog_to_explog2");
      break;
   case ir_unop_sin:
   case ir_unop_sin_reduced:
      emit_math(FS_OPCODE_SIN, this->result, op[0]);
      break;
   case ir_unop_cos:
   case ir_unop_cos_reduced:
      emit_math(FS_OPCODE_COS, this->result, op[0]);
      break;

   case ir_unop_dFdx:
      emit(fs_inst(FS_OPCODE_DDX, this->result, op[0]));
      break;
   case ir_unop_dFdy:
      emit(fs_inst(FS_OPCODE_DDY, this->result, op[0]));
      break;

   case ir_binop_add:
      emit(fs_inst(BRW_OPCODE_ADD, this->result, op[0], op[1]));
      break;
   case ir_binop_sub:
      assert(!"not reached: should be handled by ir_sub_to_add_neg");
      break;

   case ir_binop_mul:
      emit(fs_inst(BRW_OPCODE_MUL, this->result, op[0], op[1]));
      break;
   case ir_binop_div:
      assert(!"not reached: should be handled by ir_div_to_mul_rcp");
      break;
   case ir_binop_mod:
      assert(!"ir_binop_mod should have been converted to b * fract(a/b)");
      break;

   case ir_binop_less:
   case ir_binop_greater:
   case ir_binop_lequal:
   case ir_binop_gequal:
   case ir_binop_equal:
   case ir_binop_all_equal:
   case ir_binop_nequal:
   case ir_binop_any_nequal:
      temp = this->result;
      /* Original gen4 converts the operands to the destination type
       * before comparing, so the CMP destination must match the source
       * type here.
       */
      if (intel->gen < 5)
	 temp.type = op[0].type;

      inst = emit(fs_inst(BRW_OPCODE_CMP, temp, op[0], op[1]));
      inst->conditional_mod = brw_conditional_for_comparison(ir->operation);
      emit(fs_inst(BRW_OPCODE_AND, this->result, this->result, fs_reg(0x1)));
      break;

   case ir_binop_logic_xor:
      emit(fs_inst(BRW_OPCODE_XOR, this->result, op[0], op[1]));
      break;

   case ir_binop_logic_or:
      emit(fs_inst(BRW_OPCODE_OR, this->result, op[0], op[1]));
      break;

   case ir_binop_logic_and:
      emit(fs_inst(BRW_OPCODE_AND, this->result, op[0], op[1]));
      break;

   case ir_binop_dot:
   case ir_unop_any:
      assert(!"not reached: should be handled by brw_fs_channel_expressions");
      break;

   case ir_unop_noise:
      assert(!"not reached: should be handled by lower_noise");
      break;

   case ir_quadop_vector:
      assert(!"not reached: should be handled by lower_quadop_vector");
      break;

   case ir_unop_sqrt:
      emit_math(FS_OPCODE_SQRT, this->result, op[0]);
      break;

   case ir_unop_rsq:
      emit_math(FS_OPCODE_RSQ, this->result, op[0]);
      break;

   case ir_unop_i2f:
   case ir_unop_b2f:
   case ir_unop_b2i:
   case ir_unop_f2i:
      emit(fs_inst(BRW_OPCODE_MOV, this->result, op[0]));
      break;
   case ir_unop_f2b:
   case ir_unop_i2b:
      temp = this->result;
      /* Original gen4 converts the operands to the destination type
       * before comparing, so the CMP destination must match the source
       * type here.
       */
      if (intel->gen < 5)
	 temp.type = op[0].type;

      inst = emit(fs_inst(BRW_OPCODE_CMP, temp, op[0], fs_reg(0.0f)));
      inst->conditional_mod = BRW_CONDITIONAL_NZ;
      inst = emit(fs_inst(BRW_OPCODE_AND, this->result,
			  this->result, fs_reg(1)));
      break;

   case ir_unop_trunc:
      emit(fs_inst(BRW_OPCODE_RNDZ, this->result, op[0]));
      break;
   case ir_unop_ceil:
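      /* Computed as ceil(x) = -floor(-x): negate the operand, round
       * toward -inf with RNDD, and negate the result.
       */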
      op[0].negate = !op[0].negate;
      inst = emit(fs_inst(BRW_OPCODE_RNDD, this->result, op[0]));
      this->result.negate = true;
      break;
   case ir_unop_floor:
      inst = emit(fs_inst(BRW_OPCODE_RNDD, this->result, op[0]));
      break;
   case ir_unop_fract:
      inst = emit(fs_inst(BRW_OPCODE_FRC, this->result, op[0]));
      break;
   case ir_unop_round_even:
      emit(fs_inst(BRW_OPCODE_RNDE, this->result, op[0]));
      break;

   case ir_binop_min:
      inst = emit(fs_inst(BRW_OPCODE_CMP, this->result, op[0], op[1]));
      inst->conditional_mod = BRW_CONDITIONAL_L;

      inst = emit(fs_inst(BRW_OPCODE_SEL, this->result, op[0], op[1]));
      inst->predicated = true;
      break;
   case ir_binop_max:
      inst = emit(fs_inst(BRW_OPCODE_CMP, this->result, op[0], op[1]));
      inst->conditional_mod = BRW_CONDITIONAL_G;

      inst = emit(fs_inst(BRW_OPCODE_SEL, this->result, op[0], op[1]));
      inst->predicated = true;
      break;

   case ir_binop_pow:
      emit_math(FS_OPCODE_POW, this->result, op[0], op[1]);
      break;

   case ir_unop_bit_not:
      inst = emit(fs_inst(BRW_OPCODE_NOT, this->result, op[0]));
      break;
   case ir_binop_bit_and:
      inst = emit(fs_inst(BRW_OPCODE_AND, this->result, op[0], op[1]));
      break;
   case ir_binop_bit_xor:
      inst = emit(fs_inst(BRW_OPCODE_XOR, this->result, op[0], op[1]));
      break;
   case ir_binop_bit_or:
      inst = emit(fs_inst(BRW_OPCODE_OR, this->result, op[0], op[1]));
      break;

   case ir_unop_u2f:
   case ir_binop_lshift:
   case ir_binop_rshift:
      assert(!"GLSL 1.30 features unsupported");
      break;
   }
}

void
fs_visitor::emit_assignment_writes(fs_reg &l, fs_reg &r,
				   const glsl_type *type, bool predicated)
{
   switch (type->base_type) {
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_BOOL:
      for (unsigned int i = 0; i < type->components(); i++) {
	 l.type = brw_type_for_base_type(type);
	 r.type = brw_type_for_base_type(type);

	 fs_inst *inst = emit(fs_inst(BRW_OPCODE_MOV, l, r));
	 inst->predicated = predicated;

	 l.reg_offset++;
	 r.reg_offset++;
      }
      break;
   case GLSL_TYPE_ARRAY:
      for (unsigned int i = 0; i < type->length; i++) {
	 emit_assignment_writes(l, r, type->fields.array, predicated);
      }
      break;

   case GLSL_TYPE_STRUCT:
      for (unsigned int i = 0; i < type->length; i++) {
	 emit_assignment_writes(l, r, type->fields.structure[i].type,
				predicated);
      }
      break;

   case GLSL_TYPE_SAMPLER:
      break;

   default:
      assert(!"not reached");
      break;
   }
}

void
fs_visitor::visit(ir_assignment *ir)
{
   struct fs_reg l, r;
   fs_inst *inst;

   /* FINISHME: arrays on the lhs */
   ir->lhs->accept(this);
   l = this->result;

   ir->rhs->accept(this);
   r = this->result;

   assert(l.file != BAD_FILE);
   assert(r.file != BAD_FILE);

   if (ir->condition) {
      emit_bool_to_cond_code(ir->condition);
   }

   if (ir->lhs->type->is_scalar() ||
       ir->lhs->type->is_vector()) {
      for (int i = 0; i < ir->lhs->type->vector_elements; i++) {
	 if (ir->write_mask & (1 << i)) {
	    inst = emit(fs_inst(BRW_OPCODE_MOV, l, r));
	    if (ir->condition)
	       inst->predicated = true;
	    r.reg_offset++;
	 }
	 l.reg_offset++;
      }
   } else {
      emit_assignment_writes(l, r, ir->lhs->type, ir->condition != NULL);
   }
}

fs_inst *
fs_visitor::emit_texture_gen4(ir_texture *ir, fs_reg dst, fs_reg coordinate)
{
   int mlen;
   int base_mrf = 1;
   bool simd16 = false;
   fs_reg orig_dst;

   /* g0 header. */
   mlen = 1;

   if (ir->shadow_comparitor) {
      for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
	 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i),
		      coordinate));
	 coordinate.reg_offset++;
      }
      /* gen4's SIMD8 sampler always has the slots for u,v,r present. */
      mlen += 3;

      if (ir->op == ir_tex) {
	 /* There's no plain shadow compare message, so we use shadow
	  * compare with a bias of 0.0.
	  */
	 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen),
		      fs_reg(0.0f)));
	 mlen++;
      } else if (ir->op == ir_txb) {
	 ir->lod_info.bias->accept(this);
	 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen),
		      this->result));
	 mlen++;
      } else {
	 assert(ir->op == ir_txl);
	 ir->lod_info.lod->accept(this);
	 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen),
		      this->result));
	 mlen++;
      }

      ir->shadow_comparitor->accept(this);
      emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result));
      mlen++;
   } else if (ir->op == ir_tex) {
      for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
	 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i),
		      coordinate));
	 coordinate.reg_offset++;
      }
      /* gen4's SIMD8 sampler always has the slots for u,v,r present. */
      mlen += 3;
   } else if (ir->op == ir_txd) {
      assert(!"TXD isn't supported on gen4 yet.");
   } else {
      /* Oh joy.  gen4 doesn't have SIMD8 non-shadow-compare bias/lod
       * instructions.  We'll need to do SIMD16 here.
       */
      assert(ir->op == ir_txb || ir->op == ir_txl);

      for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
	 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i * 2),
		      coordinate));
	 coordinate.reg_offset++;
      }

      /* lod/bias appears after u/v/r. */
      mlen += 6;

      if (ir->op == ir_txb) {
	 ir->lod_info.bias->accept(this);
	 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen),
		      this->result));
	 mlen++;
      } else {
	 ir->lod_info.lod->accept(this);
	 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen),
		      this->result));
	 mlen++;
      }

      /* The unused upper half. */
      mlen++;

      /* Now, since we're doing simd16, the return is 2 interleaved
       * vec4s where the odd-indexed ones are junk. We'll need to move
       * this weirdness around to the expected layout.
       */
      simd16 = true;
      orig_dst = dst;
      dst = fs_reg(this, glsl_type::get_array_instance(glsl_type::vec4_type,
						       2));
      dst.type = BRW_REGISTER_TYPE_F;
   }

   fs_inst *inst = NULL;
   switch (ir->op) {
   case ir_tex:
      inst = emit(fs_inst(FS_OPCODE_TEX, dst));
      break;
   case ir_txb:
      inst = emit(fs_inst(FS_OPCODE_TXB, dst));
      break;
   case ir_txl:
      inst = emit(fs_inst(FS_OPCODE_TXL, dst));
      break;
   case ir_txd:
      inst = emit(fs_inst(FS_OPCODE_TXD, dst));
      break;
   case ir_txf:
      assert(!"GLSL 1.30 features unsupported");
      break;
   }
   inst->base_mrf = base_mrf;
   inst->mlen = mlen;

   if (simd16) {
      for (int i = 0; i < 4; i++) {
	 emit(fs_inst(BRW_OPCODE_MOV, orig_dst, dst));
	 orig_dst.reg_offset++;
	 dst.reg_offset += 2;
      }
   }

   return inst;
}

fs_inst *
fs_visitor::emit_texture_gen5(ir_texture *ir, fs_reg dst, fs_reg coordinate)
{
   /* gen5's SIMD8 sampler has slots for u, v, r, array index, then
    * optional parameters like the shadow comparitor or LOD bias.  If the
    * optional parameters aren't present, the trailing base slots can be
    * omitted from the message.
    *
    * We don't fill in the unnecessary slots regardless, which may look
    * surprising in the disassembly.
    */
1289   int base_mrf = 1;
1290
1291   for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
1292      emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i),
1293		   coordinate));
1294      coordinate.reg_offset++;
1295   }
1296   mlen += ir->coordinate->type->vector_elements;
1297
1298   if (ir->shadow_comparitor) {
1299      mlen = MAX2(mlen, 5);
1300
1301      ir->shadow_comparitor->accept(this);
1302      emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result));
1303      mlen++;
1304   }
1305
1306   fs_inst *inst = NULL;
1307   switch (ir->op) {
1308   case ir_tex:
1309      inst = emit(fs_inst(FS_OPCODE_TEX, dst));
1310      break;
1311   case ir_txb:
1312      ir->lod_info.bias->accept(this);
1313      mlen = MAX2(mlen, 5);
1314      emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result));
1315      mlen++;
1316
1317      inst = emit(fs_inst(FS_OPCODE_TXB, dst));
1318      break;
1319   case ir_txl:
1320      ir->lod_info.lod->accept(this);
1321      mlen = MAX2(mlen, 5);
1322      emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result));
1323      mlen++;
1324
1325      inst = emit(fs_inst(FS_OPCODE_TXL, dst));
1326      break;
1327   case ir_txd:
1328   case ir_txf:
1329      assert(!"GLSL 1.30 features unsupported");
1330      break;
1331   }
1332   inst->base_mrf = base_mrf;
1333   inst->mlen = mlen;
1334
1335   return inst;
1336}
1337
1338void
1339fs_visitor::visit(ir_texture *ir)
1340{
1341   int sampler;
1342   fs_inst *inst = NULL;
1343
1344   ir->coordinate->accept(this);
1345   fs_reg coordinate = this->result;
1346
1347   if (ir->offset != NULL) {
1348      ir_constant *offset = ir->offset->as_constant();
1349      assert(offset != NULL);
1350
1351      signed char offsets[3];
1352      for (unsigned i = 0; i < ir->offset->type->vector_elements; i++)
1353	 offsets[i] = (signed char) offset->value.i[i];
1354
1355      /* Combine all three offsets into a single unsigned dword:
1356       *
1357       *    bits 11:8 - U Offset (X component)
1358       *    bits  7:4 - V Offset (Y component)
1359       *    bits  3:0 - R Offset (Z component)
1360       */
1361      unsigned offset_bits = 0;
1362      for (unsigned i = 0; i < ir->offset->type->vector_elements; i++) {
1363	 const unsigned shift = 4 * (2 - i);
1364	 offset_bits |= (offsets[i] << shift) & (0xF << shift);
1365      }
1366
1367      /* Explicitly set up the message header by copying g0 to msg reg m1. */
1368      emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, 1, BRW_REGISTER_TYPE_UD),
1369				   fs_reg(GRF, 0, BRW_REGISTER_TYPE_UD)));
1370
1371      /* Then set the offset bits in DWord 2 of the message header. */
1372      emit(fs_inst(BRW_OPCODE_MOV,
1373		   fs_reg(retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, 1, 2),
1374				 BRW_REGISTER_TYPE_UD)),
1375		   fs_reg(brw_imm_uw(offset_bits))));
1376   }
1377
1378   /* Should be lowered by do_lower_texture_projection */
1379   assert(!ir->projector);
1380
1381   sampler = _mesa_get_sampler_uniform_value(ir->sampler,
1382					     ctx->Shader.CurrentFragmentProgram,
1383					     &brw->fragment_program->Base);
1384   sampler = c->fp->program.Base.SamplerUnits[sampler];
1385
1386   /* The 965 requires the EU to do the normalization of GL rectangle
1387    * texture coordinates.  We use the program parameter state
1388    * tracking to get the scaling factor.
1389    */
1390   if (ir->sampler->type->sampler_dimensionality == GLSL_SAMPLER_DIM_RECT) {
1391      struct gl_program_parameter_list *params = c->fp->program.Base.Parameters;
1392      int tokens[STATE_LENGTH] = {
1393	 STATE_INTERNAL,
1394	 STATE_TEXRECT_SCALE,
1395	 sampler,
1396	 0,
1397	 0
1398      };
1399
1400      c->prog_data.param_convert[c->prog_data.nr_params] =
1401	 PARAM_NO_CONVERT;
1402      c->prog_data.param_convert[c->prog_data.nr_params + 1] =
1403	 PARAM_NO_CONVERT;
1404
1405      fs_reg scale_x = fs_reg(UNIFORM, c->prog_data.nr_params);
1406      fs_reg scale_y = fs_reg(UNIFORM, c->prog_data.nr_params + 1);
1407      GLuint index = _mesa_add_state_reference(params,
1408					       (gl_state_index *)tokens);
1409
1410      this->param_index[c->prog_data.nr_params] = index;
1411      this->param_offset[c->prog_data.nr_params] = 0;
1412      c->prog_data.nr_params++;
1413      this->param_index[c->prog_data.nr_params] = index;
1414      this->param_offset[c->prog_data.nr_params] = 1;
1415      c->prog_data.nr_params++;
1416
1417      fs_reg dst = fs_reg(this, ir->coordinate->type);
1418      fs_reg src = coordinate;
1419      coordinate = dst;
1420
1421      emit(fs_inst(BRW_OPCODE_MUL, dst, src, scale_x));
1422      dst.reg_offset++;
1423      src.reg_offset++;
1424      emit(fs_inst(BRW_OPCODE_MUL, dst, src, scale_y));
1425   }
1426
1427   /* Writemasking doesn't eliminate channels on SIMD8 texture
1428    * samples, so don't worry about them.
1429    */
1430   fs_reg dst = fs_reg(this, glsl_type::vec4_type);
1431
1432   if (intel->gen < 5) {
1433      inst = emit_texture_gen4(ir, dst, coordinate);
1434   } else {
1435      inst = emit_texture_gen5(ir, dst, coordinate);
1436   }
1437
1438   /* If there's an offset, we already set up m1.  To avoid the implied move,
1439    * use the null register.  Otherwise, we want an implied move from g0.
1440    */
1441   if (ir->offset != NULL)
1442      inst->src[0] = fs_reg(brw_null_reg());
1443   else
1444      inst->src[0] = fs_reg(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW));
1445
1446   inst->sampler = sampler;
1447
1448   this->result = dst;
1449
1450   if (ir->shadow_comparitor)
1451      inst->shadow_compare = true;
1452
1453   if (ir->type == glsl_type::float_type) {
1454      /* Ignore DEPTH_TEXTURE_MODE swizzling. */
1455      assert(ir->sampler->type->sampler_shadow);
1456   } else if (c->key.tex_swizzles[inst->sampler] != SWIZZLE_NOOP) {
1457      fs_reg swizzle_dst = fs_reg(this, glsl_type::vec4_type);
1458
1459      for (int i = 0; i < 4; i++) {
1460	 int swiz = GET_SWZ(c->key.tex_swizzles[inst->sampler], i);
1461	 fs_reg l = swizzle_dst;
1462	 l.reg_offset += i;
1463
1464	 if (swiz == SWIZZLE_ZERO) {
1465	    emit(fs_inst(BRW_OPCODE_MOV, l, fs_reg(0.0f)));
1466	 } else if (swiz == SWIZZLE_ONE) {
1467	    emit(fs_inst(BRW_OPCODE_MOV, l, fs_reg(1.0f)));
1468	 } else {
1469	    fs_reg r = dst;
1470	    r.reg_offset += GET_SWZ(c->key.tex_swizzles[inst->sampler], i);
1471	    emit(fs_inst(BRW_OPCODE_MOV, l, r));
1472	 }
1473      }
1474      this->result = swizzle_dst;
1475   }
1476}
1477
1478void
1479fs_visitor::visit(ir_swizzle *ir)
1480{
1481   ir->val->accept(this);
1482   fs_reg val = this->result;
1483
1484   if (ir->type->vector_elements == 1) {
1485      this->result.reg_offset += ir->mask.x;
1486      return;
1487   }
1488
1489   fs_reg result = fs_reg(this, ir->type);
1490   this->result = result;
1491
1492   for (unsigned int i = 0; i < ir->type->vector_elements; i++) {
1493      fs_reg channel = val;
1494      int swiz = 0;
1495
1496      switch (i) {
1497      case 0:
1498	 swiz = ir->mask.x;
1499	 break;
1500      case 1:
1501	 swiz = ir->mask.y;
1502	 break;
1503      case 2:
1504	 swiz = ir->mask.z;
1505	 break;
1506      case 3:
1507	 swiz = ir->mask.w;
1508	 break;
1509      }
1510
1511      channel.reg_offset += swiz;
1512      emit(fs_inst(BRW_OPCODE_MOV, result, channel));
1513      result.reg_offset++;
1514   }
1515}
1516
1517void
1518fs_visitor::visit(ir_discard *ir)
1519{
1520   fs_reg temp = fs_reg(this, glsl_type::uint_type);
1521
1522   assert(ir->condition == NULL); /* FINISHME */
1523
1524   emit(fs_inst(FS_OPCODE_DISCARD_NOT, temp, reg_null_d));
1525   emit(fs_inst(FS_OPCODE_DISCARD_AND, reg_null_d, temp));
1526   kill_emitted = true;
1527}
1528
1529void
1530fs_visitor::visit(ir_constant *ir)
1531{
1532   /* Set this->result to reg at the bottom of the function because some code
1533    * paths will cause this visitor to be applied to other fields.  This will
1534    * cause the value stored in this->result to be modified.
1535    *
1536    * Make reg constant so that it doesn't get accidentally modified along the
1537    * way.  Yes, I actually had this problem. :(
1538    */
1539   const fs_reg reg(this, ir->type);
1540   fs_reg dst_reg = reg;
1541
1542   if (ir->type->is_array()) {
1543      const unsigned size = type_size(ir->type->fields.array);
1544
1545      for (unsigned i = 0; i < ir->type->length; i++) {
1546	 ir->array_elements[i]->accept(this);
1547	 fs_reg src_reg = this->result;
1548
1549	 dst_reg.type = src_reg.type;
1550	 for (unsigned j = 0; j < size; j++) {
1551	    emit(fs_inst(BRW_OPCODE_MOV, dst_reg, src_reg));
1552	    src_reg.reg_offset++;
1553	    dst_reg.reg_offset++;
1554	 }
1555      }
1556   } else if (ir->type->is_record()) {
1557      foreach_list(node, &ir->components) {
1558	 ir_instruction *const field = (ir_instruction *) node;
1559	 const unsigned size = type_size(field->type);
1560
1561	 field->accept(this);
1562	 fs_reg src_reg = this->result;
1563
1564	 dst_reg.type = src_reg.type;
1565	 for (unsigned j = 0; j < size; j++) {
1566	    emit(fs_inst(BRW_OPCODE_MOV, dst_reg, src_reg));
1567	    src_reg.reg_offset++;
1568	    dst_reg.reg_offset++;
1569	 }
1570      }
1571   } else {
1572      const unsigned size = type_size(ir->type);
1573
1574      for (unsigned i = 0; i < size; i++) {
1575	 switch (ir->type->base_type) {
1576	 case GLSL_TYPE_FLOAT:
1577	    emit(fs_inst(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.f[i])));
1578	    break;
1579	 case GLSL_TYPE_UINT:
1580	    emit(fs_inst(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.u[i])));
1581	    break;
1582	 case GLSL_TYPE_INT:
1583	    emit(fs_inst(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.i[i])));
1584	    break;
1585	 case GLSL_TYPE_BOOL:
1586	    emit(fs_inst(BRW_OPCODE_MOV, dst_reg, fs_reg((int)ir->value.b[i])));
1587	    break;
1588	 default:
1589	    assert(!"Non-float/uint/int/bool constant");
1590	 }
1591	 dst_reg.reg_offset++;
1592      }
1593   }
1594
1595   this->result = reg;
1596}
1597
1598void
1599fs_visitor::emit_bool_to_cond_code(ir_rvalue *ir)
1600{
1601   ir_expression *expr = ir->as_expression();
1602
1603   if (expr) {
1604      fs_reg op[2];
1605      fs_inst *inst;
1606
1607      assert(expr->get_num_operands() <= 2);
1608      for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
1609	 assert(expr->operands[i]->type->is_scalar());
1610
1611	 expr->operands[i]->accept(this);
1612	 op[i] = this->result;
1613      }
1614
1615      switch (expr->operation) {
1616      case ir_unop_logic_not:
1617	 inst = emit(fs_inst(BRW_OPCODE_AND, reg_null_d, op[0], fs_reg(1)));
1618	 inst->conditional_mod = BRW_CONDITIONAL_Z;
1619	 break;
1620
1621      case ir_binop_logic_xor:
1622	 inst = emit(fs_inst(BRW_OPCODE_XOR, reg_null_d, op[0], op[1]));
1623	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1624	 break;
1625
1626      case ir_binop_logic_or:
1627	 inst = emit(fs_inst(BRW_OPCODE_OR, reg_null_d, op[0], op[1]));
1628	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1629	 break;
1630
1631      case ir_binop_logic_and:
1632	 inst = emit(fs_inst(BRW_OPCODE_AND, reg_null_d, op[0], op[1]));
1633	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1634	 break;
1635
1636      case ir_unop_f2b:
1637	 if (intel->gen >= 6) {
1638	    inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null_d,
1639				op[0], fs_reg(0.0f)));
1640	 } else {
1641	    inst = emit(fs_inst(BRW_OPCODE_MOV, reg_null_f, op[0]));
1642	 }
1643	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1644	 break;
1645
1646      case ir_unop_i2b:
1647	 if (intel->gen >= 6) {
1648	    inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null_d, op[0], fs_reg(0)));
1649	 } else {
1650	    inst = emit(fs_inst(BRW_OPCODE_MOV, reg_null_d, op[0]));
1651	 }
1652	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1653	 break;
1654
1655      case ir_binop_greater:
1656      case ir_binop_gequal:
1657      case ir_binop_less:
1658      case ir_binop_lequal:
1659      case ir_binop_equal:
1660      case ir_binop_all_equal:
1661      case ir_binop_nequal:
1662      case ir_binop_any_nequal:
1663	 inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null_cmp, op[0], op[1]));
1664	 inst->conditional_mod =
1665	    brw_conditional_for_comparison(expr->operation);
1666	 break;
1667
1668      default:
1669	 assert(!"not reached");
1670	 this->fail = true;
1671	 break;
1672      }
1673      return;
1674   }
1675
1676   ir->accept(this);
1677
1678   if (intel->gen >= 6) {
1679      fs_inst *inst = emit(fs_inst(BRW_OPCODE_AND, reg_null_d,
1680				   this->result, fs_reg(1)));
1681      inst->conditional_mod = BRW_CONDITIONAL_NZ;
1682   } else {
1683      fs_inst *inst = emit(fs_inst(BRW_OPCODE_MOV, reg_null_d, this->result));
1684      inst->conditional_mod = BRW_CONDITIONAL_NZ;
1685   }
1686}
1687
1688/**
1689 * Emit a gen6 IF statement with the comparison folded into the IF
1690 * instruction.
1691 */
1692void
1693fs_visitor::emit_if_gen6(ir_if *ir)
1694{
1695   ir_expression *expr = ir->condition->as_expression();
1696
1697   if (expr) {
1698      fs_reg op[2];
1699      fs_inst *inst;
1700      fs_reg temp;
1701
1702      assert(expr->get_num_operands() <= 2);
1703      for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
1704	 assert(expr->operands[i]->type->is_scalar());
1705
1706	 expr->operands[i]->accept(this);
1707	 op[i] = this->result;
1708      }
1709
1710      switch (expr->operation) {
1711      case ir_unop_logic_not:
	 inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_d, op[0], fs_reg(0)));
1713	 inst->conditional_mod = BRW_CONDITIONAL_Z;
1714	 return;
1715
1716      case ir_binop_logic_xor:
1717	 inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_d, op[0], op[1]));
1718	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1719	 return;
1720
1721      case ir_binop_logic_or:
1722	 temp = fs_reg(this, glsl_type::bool_type);
1723	 emit(fs_inst(BRW_OPCODE_OR, temp, op[0], op[1]));
1724	 inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_d, temp, fs_reg(0)));
1725	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1726	 return;
1727
1728      case ir_binop_logic_and:
1729	 temp = fs_reg(this, glsl_type::bool_type);
1730	 emit(fs_inst(BRW_OPCODE_AND, temp, op[0], op[1]));
1731	 inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_d, temp, fs_reg(0)));
1732	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1733	 return;
1734
1735      case ir_unop_f2b:
1736	 inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_f, op[0], fs_reg(0)));
1737	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1738	 return;
1739
1740      case ir_unop_i2b:
1741	 inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_d, op[0], fs_reg(0)));
1742	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1743	 return;
1744
1745      case ir_binop_greater:
1746      case ir_binop_gequal:
1747      case ir_binop_less:
1748      case ir_binop_lequal:
1749      case ir_binop_equal:
1750      case ir_binop_all_equal:
1751      case ir_binop_nequal:
1752      case ir_binop_any_nequal:
1753	 inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_d, op[0], op[1]));
1754	 inst->conditional_mod =
1755	    brw_conditional_for_comparison(expr->operation);
1756	 return;
1757      default:
1758	 assert(!"not reached");
1759	 inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_d, op[0], fs_reg(0)));
1760	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1761	 this->fail = true;
1762	 return;
1763      }
1764      return;
1765   }
1766
1767   ir->condition->accept(this);
1768
1769   fs_inst *inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_d, this->result, fs_reg(0)));
1770   inst->conditional_mod = BRW_CONDITIONAL_NZ;
1771}
1772
1773void
1774fs_visitor::visit(ir_if *ir)
1775{
1776   fs_inst *inst;
1777
1778   /* Don't point the annotation at the if statement, because then it plus
1779    * the then and else blocks get printed.
1780    */
1781   this->base_ir = ir->condition;
1782
1783   if (intel->gen >= 6) {
1784      emit_if_gen6(ir);
1785   } else {
1786      emit_bool_to_cond_code(ir->condition);
1787
1788      inst = emit(fs_inst(BRW_OPCODE_IF));
1789      inst->predicated = true;
1790   }
1791
1792   foreach_iter(exec_list_iterator, iter, ir->then_instructions) {
1793      ir_instruction *ir = (ir_instruction *)iter.get();
1794      this->base_ir = ir;
1795
1796      ir->accept(this);
1797   }
1798
1799   if (!ir->else_instructions.is_empty()) {
1800      emit(fs_inst(BRW_OPCODE_ELSE));
1801
1802      foreach_iter(exec_list_iterator, iter, ir->else_instructions) {
1803	 ir_instruction *ir = (ir_instruction *)iter.get();
1804	 this->base_ir = ir;
1805
1806	 ir->accept(this);
1807      }
1808   }
1809
1810   emit(fs_inst(BRW_OPCODE_ENDIF));
1811}
1812
1813void
1814fs_visitor::visit(ir_loop *ir)
1815{
1816   fs_reg counter = reg_undef;
1817
1818   if (ir->counter) {
1819      this->base_ir = ir->counter;
1820      ir->counter->accept(this);
1821      counter = *(variable_storage(ir->counter));
1822
1823      if (ir->from) {
1824	 this->base_ir = ir->from;
1825	 ir->from->accept(this);
1826
1827	 emit(fs_inst(BRW_OPCODE_MOV, counter, this->result));
1828      }
1829   }
1830
1831   emit(fs_inst(BRW_OPCODE_DO));
1832
1833   if (ir->to) {
1834      this->base_ir = ir->to;
1835      ir->to->accept(this);
1836
1837      fs_inst *inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null_cmp,
1838				   counter, this->result));
1839      inst->conditional_mod = brw_conditional_for_comparison(ir->cmp);
1840
1841      inst = emit(fs_inst(BRW_OPCODE_BREAK));
1842      inst->predicated = true;
1843   }
1844
1845   foreach_iter(exec_list_iterator, iter, ir->body_instructions) {
1846      ir_instruction *ir = (ir_instruction *)iter.get();
1847
1848      this->base_ir = ir;
1849      ir->accept(this);
1850   }
1851
1852   if (ir->increment) {
1853      this->base_ir = ir->increment;
1854      ir->increment->accept(this);
1855      emit(fs_inst(BRW_OPCODE_ADD, counter, counter, this->result));
1856   }
1857
1858   emit(fs_inst(BRW_OPCODE_WHILE));
1859}
1860
1861void
1862fs_visitor::visit(ir_loop_jump *ir)
1863{
1864   switch (ir->mode) {
1865   case ir_loop_jump::jump_break:
1866      emit(fs_inst(BRW_OPCODE_BREAK));
1867      break;
1868   case ir_loop_jump::jump_continue:
1869      emit(fs_inst(BRW_OPCODE_CONTINUE));
1870      break;
1871   }
1872}
1873
1874void
1875fs_visitor::visit(ir_call *ir)
1876{
1877   assert(!"FINISHME");
1878}
1879
1880void
1881fs_visitor::visit(ir_return *ir)
1882{
1883   assert(!"FINISHME");
1884}
1885
1886void
1887fs_visitor::visit(ir_function *ir)
1888{
1889   /* Ignore function bodies other than main() -- we shouldn't see calls to
1890    * them since they should all be inlined by the time we get here.
1891    */
1892   if (strcmp(ir->name, "main") == 0) {
1893      const ir_function_signature *sig;
1894      exec_list empty;
1895
1896      sig = ir->matching_signature(&empty);
1897
1898      assert(sig);
1899
1900      foreach_iter(exec_list_iterator, iter, sig->body) {
1901	 ir_instruction *ir = (ir_instruction *)iter.get();
1902	 this->base_ir = ir;
1903
1904	 ir->accept(this);
1905      }
1906   }
1907}
1908
1909void
1910fs_visitor::visit(ir_function_signature *ir)
1911{
1912   assert(!"not reached");
1913   (void)ir;
1914}
1915
1916fs_inst *
1917fs_visitor::emit(fs_inst inst)
1918{
1919   fs_inst *list_inst = new(mem_ctx) fs_inst;
1920   *list_inst = inst;
1921
1922   list_inst->annotation = this->current_annotation;
1923   list_inst->ir = this->base_ir;
1924
1925   this->instructions.push_tail(list_inst);
1926
1927   return list_inst;
1928}
1929
1930/** Emits a dummy fragment shader consisting of magenta for bringup purposes. */
1931void
1932fs_visitor::emit_dummy_fs()
1933{
1934   /* Everyone's favorite color. */
1935   emit(fs_inst(BRW_OPCODE_MOV,
1936		fs_reg(MRF, 2),
1937		fs_reg(1.0f)));
1938   emit(fs_inst(BRW_OPCODE_MOV,
1939		fs_reg(MRF, 3),
1940		fs_reg(0.0f)));
1941   emit(fs_inst(BRW_OPCODE_MOV,
1942		fs_reg(MRF, 4),
1943		fs_reg(1.0f)));
1944   emit(fs_inst(BRW_OPCODE_MOV,
1945		fs_reg(MRF, 5),
1946		fs_reg(0.0f)));
1947
1948   fs_inst *write;
1949   write = emit(fs_inst(FS_OPCODE_FB_WRITE,
1950			fs_reg(0),
1951			fs_reg(0)));
1952   write->base_mrf = 0;
1953}
1954
1955/* The register location here is relative to the start of the URB
1956 * data.  It will get adjusted to be a real location before
1957 * generate_code() time.
1958 */
1959struct brw_reg
1960fs_visitor::interp_reg(int location, int channel)
1961{
1962   int regnr = urb_setup[location] * 2 + channel / 2;
1963   int stride = (channel & 1) * 4;
1964
1965   assert(urb_setup[location] != -1);
1966
1967   return brw_vec1_grf(regnr, stride);
1968}
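
/* Worked example (hypothetical layout): if urb_setup[FRAG_ATTRIB_COL0] were 1,
 * then channel 2 (.z) gives regnr = 1 * 2 + 2 / 2 = 3 with a sub-register
 * offset of (2 & 1) * 4 = 0, while channel 3 (.w) lands in the same regnr at
 * offset 4 -- each attribute's four channels of setup coefficients thus span
 * two registers, two channels per register.
 */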
1969
1970/** Emits the interpolation for the varying inputs. */
1971void
1972fs_visitor::emit_interpolation_setup_gen4()
1973{
1974   struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
1975
1976   this->current_annotation = "compute pixel centers";
1977   this->pixel_x = fs_reg(this, glsl_type::uint_type);
1978   this->pixel_y = fs_reg(this, glsl_type::uint_type);
1979   this->pixel_x.type = BRW_REGISTER_TYPE_UW;
1980   this->pixel_y.type = BRW_REGISTER_TYPE_UW;
1981   emit(fs_inst(BRW_OPCODE_ADD,
1982		this->pixel_x,
1983		fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
1984		fs_reg(brw_imm_v(0x10101010))));
1985   emit(fs_inst(BRW_OPCODE_ADD,
1986		this->pixel_y,
1987		fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
1988		fs_reg(brw_imm_v(0x11001100))));
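
   /* A note on the immediates: brw_imm_v() packs eight signed 4-bit values,
    * low nibble first, so 0x10101010 is the per-pixel X offset vector
    * <0,1,0,1,0,1,0,1> and 0x11001100 is the Y offset vector
    * <0,0,1,1,0,0,1,1> -- the positions of the four pixels in each of the
    * two 2x2 subspans relative to the subspan origin held in g1.
    */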
1989
1990   this->current_annotation = "compute pixel deltas from v0";
1991   if (brw->has_pln) {
1992      this->delta_x = fs_reg(this, glsl_type::vec2_type);
1993      this->delta_y = this->delta_x;
1994      this->delta_y.reg_offset++;
1995   } else {
1996      this->delta_x = fs_reg(this, glsl_type::float_type);
1997      this->delta_y = fs_reg(this, glsl_type::float_type);
1998   }
1999   emit(fs_inst(BRW_OPCODE_ADD,
2000		this->delta_x,
2001		this->pixel_x,
2002		fs_reg(negate(brw_vec1_grf(1, 0)))));
2003   emit(fs_inst(BRW_OPCODE_ADD,
2004		this->delta_y,
2005		this->pixel_y,
2006		fs_reg(negate(brw_vec1_grf(1, 1)))));
2007
2008   this->current_annotation = "compute pos.w and 1/pos.w";
2009   /* Compute wpos.w.  It's always in our setup, since it's needed to
2010    * interpolate the other attributes.
2011    */
2012   this->wpos_w = fs_reg(this, glsl_type::float_type);
2013   emit(fs_inst(FS_OPCODE_LINTERP, wpos_w, this->delta_x, this->delta_y,
2014		interp_reg(FRAG_ATTRIB_WPOS, 3)));
2015   /* Compute the pixel 1/W value from wpos.w. */
2016   this->pixel_w = fs_reg(this, glsl_type::float_type);
2017   emit_math(FS_OPCODE_RCP, this->pixel_w, wpos_w);
2018   this->current_annotation = NULL;
2019}
2020
2021/** Emits the interpolation for the varying inputs. */
2022void
2023fs_visitor::emit_interpolation_setup_gen6()
2024{
2025   struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
2026
2027   /* If the pixel centers end up used, the setup is the same as for gen4. */
2028   this->current_annotation = "compute pixel centers";
2029   fs_reg int_pixel_x = fs_reg(this, glsl_type::uint_type);
2030   fs_reg int_pixel_y = fs_reg(this, glsl_type::uint_type);
2031   int_pixel_x.type = BRW_REGISTER_TYPE_UW;
2032   int_pixel_y.type = BRW_REGISTER_TYPE_UW;
2033   emit(fs_inst(BRW_OPCODE_ADD,
2034		int_pixel_x,
2035		fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
2036		fs_reg(brw_imm_v(0x10101010))));
2037   emit(fs_inst(BRW_OPCODE_ADD,
2038		int_pixel_y,
2039		fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
2040		fs_reg(brw_imm_v(0x11001100))));
2041
2042   /* As of gen6, we can no longer mix float and int sources.  We have
2043    * to turn the integer pixel centers into floats for their actual
2044    * use.
2045    */
2046   this->pixel_x = fs_reg(this, glsl_type::float_type);
2047   this->pixel_y = fs_reg(this, glsl_type::float_type);
2048   emit(fs_inst(BRW_OPCODE_MOV, this->pixel_x, int_pixel_x));
2049   emit(fs_inst(BRW_OPCODE_MOV, this->pixel_y, int_pixel_y));
2050
2051   this->current_annotation = "compute 1/pos.w";
2052   this->wpos_w = fs_reg(brw_vec8_grf(c->source_w_reg, 0));
2053   this->pixel_w = fs_reg(this, glsl_type::float_type);
2054   emit_math(FS_OPCODE_RCP, this->pixel_w, wpos_w);
2055
2056   this->delta_x = fs_reg(brw_vec8_grf(2, 0));
2057   this->delta_y = fs_reg(brw_vec8_grf(3, 0));
2058
2059   this->current_annotation = NULL;
2060}
2061
2062void
2063fs_visitor::emit_fb_writes()
2064{
2065   this->current_annotation = "FB write header";
2066   bool header_present = true;
2067   int nr = 0;
2068
2069   if (intel->gen >= 6 &&
2070       !this->kill_emitted &&
2071       c->key.nr_color_regions == 1) {
2072      header_present = false;
2073   }
2074
2075   if (header_present) {
2076      /* m0, m1 header */
2077      nr += 2;
2078   }
2079
2080   if (c->aa_dest_stencil_reg) {
2081      emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
2082		   fs_reg(brw_vec8_grf(c->aa_dest_stencil_reg, 0))));
2083   }
2084
2085   /* Reserve space for color. It'll be filled in per MRT below. */
2086   int color_mrf = nr;
2087   nr += 4;
2088
2089   if (c->source_depth_to_render_target) {
2090      if (c->computes_depth) {
2091	 /* Hand over gl_FragDepth. */
2092	 assert(this->frag_depth);
2093	 fs_reg depth = *(variable_storage(this->frag_depth));
2094
2095	 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, nr++), depth));
2096      } else {
2097	 /* Pass through the payload depth. */
2098	 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
2099		      fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
2100      }
2101   }
2102
2103   if (c->dest_depth_reg) {
2104      emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
2105		   fs_reg(brw_vec8_grf(c->dest_depth_reg, 0))));
2106   }
2107
2108   fs_reg color = reg_undef;
2109   if (this->frag_color)
2110      color = *(variable_storage(this->frag_color));
2111   else if (this->frag_data) {
2112      color = *(variable_storage(this->frag_data));
2113      color.type = BRW_REGISTER_TYPE_F;
2114   }
2115
2116   for (int target = 0; target < c->key.nr_color_regions; target++) {
2117      this->current_annotation = ralloc_asprintf(this->mem_ctx,
2118						 "FB write target %d",
2119						 target);
2120      if (this->frag_color || this->frag_data) {
2121	 for (int i = 0; i < 4; i++) {
2122	    emit(fs_inst(BRW_OPCODE_MOV,
2123			 fs_reg(MRF, color_mrf + i),
2124			 color));
2125	    color.reg_offset++;
2126	 }
2127      }
2128
2129      if (this->frag_color)
2130	 color.reg_offset -= 4;
2131
2132      fs_inst *inst = emit(fs_inst(FS_OPCODE_FB_WRITE,
2133				   reg_undef, reg_undef));
2134      inst->target = target;
2135      inst->base_mrf = 0;
2136      inst->mlen = nr;
2137      if (target == c->key.nr_color_regions - 1)
2138	 inst->eot = true;
2139      inst->header_present = header_present;
2140   }
2141
2142   if (c->key.nr_color_regions == 0) {
2143      if (c->key.alpha_test && (this->frag_color || this->frag_data)) {
2144	 /* If the alpha test is enabled but there's no color buffer,
2145	  * we still need to send alpha out the pipeline to our null
2146	  * renderbuffer.
2147	  */
2148	 color.reg_offset += 3;
2149	 emit(fs_inst(BRW_OPCODE_MOV,
2150		      fs_reg(MRF, color_mrf + 3),
2151		      color));
2152      }
2153
2154      fs_inst *inst = emit(fs_inst(FS_OPCODE_FB_WRITE,
2155				   reg_undef, reg_undef));
2156      inst->base_mrf = 0;
2157      inst->mlen = nr;
2158      inst->eot = true;
2159      inst->header_present = header_present;
2160   }
2161
2162   this->current_annotation = NULL;
2163}
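
/* Sketch of the message layout built above for the common case (header
 * present, no AA dest stencil and no depth writes): m0/m1 hold the header,
 * m2..m5 hold the four color channels, and mlen ends up as 6.  The per-MRT
 * loop rewrites m2..m5 in place for each target instead of reserving
 * separate message registers per render target.
 */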
2164
2165void
2166fs_visitor::generate_fb_write(fs_inst *inst)
2167{
2168   GLboolean eot = inst->eot;
2169   struct brw_reg implied_header;
2170
2171   /* The header is 2 regs; g0 and g1 are its contents.  g0 is covered by
2172    * the SEND's implied move, so here we just load g1.
2173    */
2174   brw_push_insn_state(p);
2175   brw_set_mask_control(p, BRW_MASK_DISABLE);
2176   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2177
2178   if (inst->header_present) {
2179      if (intel->gen >= 6) {
2180	 brw_MOV(p,
2181		 brw_message_reg(inst->base_mrf),
2182		 brw_vec8_grf(0, 0));
2183
2184	 if (inst->target > 0) {
2185	    /* Set the render target index for choosing BLEND_STATE. */
2186	    brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, 0, 2),
2187			      BRW_REGISTER_TYPE_UD),
2188		    brw_imm_ud(inst->target));
2189	 }
2190
2191	 /* Clear viewport index, render target array index. */
2192	 brw_AND(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, 0, 0),
2193			   BRW_REGISTER_TYPE_UD),
2194		 retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD),
2195		 brw_imm_ud(0xf7ff));
2196
2197	 implied_header = brw_null_reg();
2198      } else {
2199	 implied_header = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
2200      }
2201
2202      brw_MOV(p,
2203	      brw_message_reg(inst->base_mrf + 1),
2204	      brw_vec8_grf(1, 0));
2205   } else {
2206      implied_header = brw_null_reg();
2207   }
2208
2209   brw_pop_insn_state(p);
2210
2211   brw_fb_WRITE(p,
2212		8, /* dispatch_width */
2213		retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW),
2214		inst->base_mrf,
2215		implied_header,
2216		inst->target,
2217		inst->mlen,
2218		0,
2219		eot,
2220		inst->header_present);
2221}
2222
2223void
2224fs_visitor::generate_linterp(fs_inst *inst,
2225			     struct brw_reg dst, struct brw_reg *src)
2226{
2227   struct brw_reg delta_x = src[0];
2228   struct brw_reg delta_y = src[1];
2229   struct brw_reg interp = src[2];
2230
2231   if (brw->has_pln &&
2232       delta_y.nr == delta_x.nr + 1 &&
2233       (intel->gen >= 6 || (delta_x.nr & 1) == 0)) {
2234      brw_PLN(p, dst, interp, delta_x);
2235   } else {
2236      brw_LINE(p, brw_null_reg(), interp, delta_x);
2237      brw_MAC(p, dst, suboffset(interp, 1), delta_y);
2238   }
2239}
2240
2241void
2242fs_visitor::generate_math(fs_inst *inst,
2243			  struct brw_reg dst, struct brw_reg *src)
2244{
2245   int op;
2246
2247   switch (inst->opcode) {
2248   case FS_OPCODE_RCP:
2249      op = BRW_MATH_FUNCTION_INV;
2250      break;
2251   case FS_OPCODE_RSQ:
2252      op = BRW_MATH_FUNCTION_RSQ;
2253      break;
2254   case FS_OPCODE_SQRT:
2255      op = BRW_MATH_FUNCTION_SQRT;
2256      break;
2257   case FS_OPCODE_EXP2:
2258      op = BRW_MATH_FUNCTION_EXP;
2259      break;
2260   case FS_OPCODE_LOG2:
2261      op = BRW_MATH_FUNCTION_LOG;
2262      break;
2263   case FS_OPCODE_POW:
2264      op = BRW_MATH_FUNCTION_POW;
2265      break;
2266   case FS_OPCODE_SIN:
2267      op = BRW_MATH_FUNCTION_SIN;
2268      break;
2269   case FS_OPCODE_COS:
2270      op = BRW_MATH_FUNCTION_COS;
2271      break;
2272   default:
2273      assert(!"not reached: unknown math function");
2274      op = 0;
2275      break;
2276   }
2277
2278   if (intel->gen >= 6) {
2279      assert(inst->mlen == 0);
2280
2281      if (inst->opcode == FS_OPCODE_POW) {
2282	 brw_math2(p, dst, op, src[0], src[1]);
2283      } else {
2284	 brw_math(p, dst,
2285		  op,
2286		  inst->saturate ? BRW_MATH_SATURATE_SATURATE :
2287		  BRW_MATH_SATURATE_NONE,
2288		  0, src[0],
2289		  BRW_MATH_DATA_VECTOR,
2290		  BRW_MATH_PRECISION_FULL);
2291      }
2292   } else {
2293      assert(inst->mlen >= 1);
2294
2295      brw_math(p, dst,
2296	       op,
2297	       inst->saturate ? BRW_MATH_SATURATE_SATURATE :
2298	       BRW_MATH_SATURATE_NONE,
2299	       inst->base_mrf, src[0],
2300	       BRW_MATH_DATA_VECTOR,
2301	       BRW_MATH_PRECISION_FULL);
2302   }
2303}
2304
2305void
2306fs_visitor::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
2307{
2308   int msg_type = -1;
2309   int rlen = 4;
2310   uint32_t simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
2311
2312   if (intel->gen >= 5) {
2313      switch (inst->opcode) {
2314      case FS_OPCODE_TEX:
2315	 if (inst->shadow_compare) {
2316	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE;
2317	 } else {
2318	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE;
2319	 }
2320	 break;
2321      case FS_OPCODE_TXB:
2322	 if (inst->shadow_compare) {
2323	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE;
2324	 } else {
2325	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS;
2326	 }
2327	 break;
2328      case FS_OPCODE_TXL:
2329	 if (inst->shadow_compare) {
2330	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE;
2331	 } else {
2332	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD;
2333	 }
2334	 break;
2335      case FS_OPCODE_TXD:
2336	 assert(!"TXD isn't supported on gen5+ yet.");
2337	 break;
2338      }
2339   } else {
2340      switch (inst->opcode) {
2341      case FS_OPCODE_TEX:
2342	 /* Note that G45 and older determine shadow compare and dispatch width
2343	  * from message length for most messages.
2344	  */
2345	 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
2346	 if (inst->shadow_compare) {
2347	    assert(inst->mlen == 6);
2348	 } else {
2349	    assert(inst->mlen <= 4);
2350	 }
2351	 break;
2352      case FS_OPCODE_TXB:
2353	 if (inst->shadow_compare) {
2354	    assert(inst->mlen == 6);
2355	    msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_COMPARE;
2356	 } else {
2357	    assert(inst->mlen == 9);
2358	    msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
2359	    simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
2360	 }
2361	 break;
2362      case FS_OPCODE_TXL:
2363	 if (inst->shadow_compare) {
2364	    assert(inst->mlen == 6);
2365	    msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE;
2366	 } else {
2367	    assert(inst->mlen == 9);
2368	    msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD;
2369	    simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
2370	 }
2371	 break;
2372      case FS_OPCODE_TXD:
2373	 assert(!"TXD isn't supported on gen4 yet.");
2374	 break;
2375      }
2376   }
2377   assert(msg_type != -1);
2378
2379   if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) {
2380      rlen = 8;
2381      dst = vec16(dst);
2382   }
2383
2384   brw_SAMPLE(p,
2385	      retype(dst, BRW_REGISTER_TYPE_UW),
2386	      inst->base_mrf,
2387	      src,
2388              SURF_INDEX_TEXTURE(inst->sampler),
2389	      inst->sampler,
2390	      WRITEMASK_XYZW,
2391	      msg_type,
2392	      rlen,
2393	      inst->mlen,
2394	      0,
2395	      1,
2396	      simd_mode);
2397}
2398
2399
2400/* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
2401 * looking like:
2402 *
2403 * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
2404 *
2405 * and we're trying to produce:
2406 *
2407 *           DDX                     DDY
2408 * dst: (ss0.tr - ss0.tl)     (ss0.tl - ss0.bl)
2409 *      (ss0.tr - ss0.tl)     (ss0.tr - ss0.br)
2410 *      (ss0.br - ss0.bl)     (ss0.tl - ss0.bl)
2411 *      (ss0.br - ss0.bl)     (ss0.tr - ss0.br)
2412 *      (ss1.tr - ss1.tl)     (ss1.tl - ss1.bl)
2413 *      (ss1.tr - ss1.tl)     (ss1.tr - ss1.br)
2414 *      (ss1.br - ss1.bl)     (ss1.tl - ss1.bl)
2415 *      (ss1.br - ss1.bl)     (ss1.tr - ss1.br)
2416 *
2417 * and add another set of two more subspans if in 16-pixel dispatch mode.
2418 *
2419 * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
2420 * for each pair, and vertstride = 2 jumps us 2 elements after processing a
2421 * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
2422 * between each other.  We could probably do it like ddx and swizzle the right
2423 * order later, but bail for now and just produce
2424 * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
2425 */
2426void
2427fs_visitor::generate_ddx(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
2428{
2429   struct brw_reg src0 = brw_reg(src.file, src.nr, 1,
2430				 BRW_REGISTER_TYPE_F,
2431				 BRW_VERTICAL_STRIDE_2,
2432				 BRW_WIDTH_2,
2433				 BRW_HORIZONTAL_STRIDE_0,
2434				 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
2435   struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
2436				 BRW_REGISTER_TYPE_F,
2437				 BRW_VERTICAL_STRIDE_2,
2438				 BRW_WIDTH_2,
2439				 BRW_HORIZONTAL_STRIDE_0,
2440				 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
2441   brw_ADD(p, dst, src0, negate(src1));
2442}
2443
2444void
2445fs_visitor::generate_ddy(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
2446{
2447   struct brw_reg src0 = brw_reg(src.file, src.nr, 0,
2448				 BRW_REGISTER_TYPE_F,
2449				 BRW_VERTICAL_STRIDE_4,
2450				 BRW_WIDTH_4,
2451				 BRW_HORIZONTAL_STRIDE_0,
2452				 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
2453   struct brw_reg src1 = brw_reg(src.file, src.nr, 2,
2454				 BRW_REGISTER_TYPE_F,
2455				 BRW_VERTICAL_STRIDE_4,
2456				 BRW_WIDTH_4,
2457				 BRW_HORIZONTAL_STRIDE_0,
2458				 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
2459   brw_ADD(p, dst, src0, negate(src1));
2460}
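
/* In assembly terms, the generate_ddx() regions above would read roughly as
 * src0 = g<n>.1<2;2,0>:F and src1 = g<n>.0<2;2,0>:F: width 2 with horizontal
 * stride 0 repeats one value across both pixels of a subspan pair, and
 * vertical stride 2 then hops to the next pair, yielding
 * (tr - tl, tr - tl, br - bl, br - bl) per subspan as the table above shows.
 */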
2461
2462void
2463fs_visitor::generate_discard_not(fs_inst *inst, struct brw_reg mask)
2464{
2465   if (intel->gen >= 6) {
2466      /* Gen6 no longer has the mask reg for us to just read the
2467       * active channels from.  However, cmp updates just the channels
2468       * of the flag reg that are enabled, so we can get at the
2469       * channel enables that way.  In this step, make a reg of ones
2470       * we'll compare to.
2471       */
2472      brw_MOV(p, mask, brw_imm_ud(1));
2473   } else {
2474      brw_push_insn_state(p);
2475      brw_set_mask_control(p, BRW_MASK_DISABLE);
2476      brw_NOT(p, mask, brw_mask_reg(1)); /* IMASK */
2477      brw_pop_insn_state(p);
2478   }
2479}
2480
2481void
2482fs_visitor::generate_discard_and(fs_inst *inst, struct brw_reg mask)
2483{
2484   if (intel->gen >= 6) {
2485      struct brw_reg f0 = brw_flag_reg();
2486      struct brw_reg g1 = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
2487
2488      brw_push_insn_state(p);
2489      brw_set_mask_control(p, BRW_MASK_DISABLE);
2490      brw_MOV(p, f0, brw_imm_uw(0xffff)); /* inactive channels undiscarded */
2491      brw_pop_insn_state(p);
2492
2493      brw_CMP(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UD),
2494	      BRW_CONDITIONAL_Z, mask, brw_imm_ud(0)); /* active channels fail test */
2495      /* Undo CMP's whacking of predication. */
2496      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2497
2498      brw_push_insn_state(p);
2499      brw_set_mask_control(p, BRW_MASK_DISABLE);
2500      brw_AND(p, g1, f0, g1);
2501      brw_pop_insn_state(p);
2502   } else {
2503      struct brw_reg g0 = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
2504
2505      mask = brw_uw1_reg(mask.file, mask.nr, 0);
2506
2507      brw_push_insn_state(p);
2508      brw_set_mask_control(p, BRW_MASK_DISABLE);
2509      brw_AND(p, g0, mask, g0);
2510      brw_pop_insn_state(p);
2511   }
2512}
2513
2514void
2515fs_visitor::generate_spill(fs_inst *inst, struct brw_reg src)
2516{
2517   assert(inst->mlen != 0);
2518
2519   brw_MOV(p,
2520	   retype(brw_message_reg(inst->base_mrf + 1), BRW_REGISTER_TYPE_UD),
2521	   retype(src, BRW_REGISTER_TYPE_UD));
2522   brw_oword_block_write_scratch(p, brw_message_reg(inst->base_mrf), 1,
2523				 inst->offset);
2524}
2525
2526void
2527fs_visitor::generate_unspill(fs_inst *inst, struct brw_reg dst)
2528{
2529   assert(inst->mlen != 0);
2530
2531   /* Clear any post destination dependencies that would be ignored by
2532    * the block read.  See the B-Spec for pre-gen5 send instruction.
2533    *
2534    * This could use a better solution, since texture sampling and
2535    * math reads could potentially run into it as well -- anywhere
2536    * that we have a SEND with a destination that is a register that
2537    * was written but not read within the last N instructions (what's
2538    * N?  unsure).  This is rare because of dead code elimination, but
2539    * not impossible.
2540    */
2541   if (intel->gen == 4 && !intel->is_g4x)
2542      brw_MOV(p, brw_null_reg(), dst);
2543
2544   brw_oword_block_read_scratch(p, dst, brw_message_reg(inst->base_mrf), 1,
2545				inst->offset);
2546
2547   if (intel->gen == 4 && !intel->is_g4x) {
2548      /* gen4 errata: destination from a send can't be used as a
2549       * destination until it's been read.  Just read it so we don't
2550       * have to worry.
2551       */
2552      brw_MOV(p, brw_null_reg(), dst);
2553   }
2554}
2555
2556
2558fs_visitor::generate_pull_constant_load(fs_inst *inst, struct brw_reg dst)
2559{
2560   assert(inst->mlen != 0);
2561
2562   /* Clear any post destination dependencies that would be ignored by
2563    * the block read.  See the B-Spec for pre-gen5 send instruction.
2564    *
2565    * This could use a better solution, since texture sampling and
2566    * math reads could potentially run into it as well -- anywhere
2567    * that we have a SEND with a destination that is a register that
2568    * was written but not read within the last N instructions (what's
2569    * N?  unsure).  This is rare because of dead code elimination, but
2570    * not impossible.
2571    */
2572   if (intel->gen == 4 && !intel->is_g4x)
2573      brw_MOV(p, brw_null_reg(), dst);
2574
2575   brw_oword_block_read(p, dst, brw_message_reg(inst->base_mrf),
2576			inst->offset, SURF_INDEX_FRAG_CONST_BUFFER);
2577
2578   if (intel->gen == 4 && !intel->is_g4x) {
2579      /* gen4 errata: destination from a send can't be used as a
2580       * destination until it's been read.  Just read it so we don't
2581       * have to worry.
2582       */
2583      brw_MOV(p, brw_null_reg(), dst);
2584   }
2585}
2586
2587/**
2588 * To be called after the last _mesa_add_state_reference() call, to
2589 * set up prog_data.param[] for assign_curb_setup() and
2590 * setup_pull_constants().
2591 */
2592void
2593fs_visitor::setup_paramvalues_refs()
2594{
2595   /* Set up the pointers to ParamValues now that that array is finalized. */
2596   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
2597      c->prog_data.param[i] =
2598	 fp->Base.Parameters->ParameterValues[this->param_index[i]] +
2599	 this->param_offset[i];
2600   }
2601}
2602
2603void
2604fs_visitor::assign_curb_setup()
2605{
2606   c->prog_data.first_curbe_grf = c->nr_payload_regs;
2607   c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
2608
2609   /* Map the offsets in the UNIFORM file to fixed HW regs. */
2610   foreach_iter(exec_list_iterator, iter, this->instructions) {
2611      fs_inst *inst = (fs_inst *)iter.get();
2612
2613      for (unsigned int i = 0; i < 3; i++) {
2614	 if (inst->src[i].file == UNIFORM) {
2615	    int constant_nr = inst->src[i].hw_reg + inst->src[i].reg_offset;
2616	    struct brw_reg brw_reg = brw_vec1_grf(c->prog_data.first_curbe_grf +
2617						  constant_nr / 8,
2618						  constant_nr % 8);
2619
2620	    inst->src[i].file = FIXED_HW_REG;
2621	    inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
2622	 }
2623      }
2624   }
2625}
2626
2627void
2628fs_visitor::calculate_urb_setup()
2629{
2630   for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
2631      urb_setup[i] = -1;
2632   }
2633
2634   int urb_next = 0;
2635   /* Figure out where each of the incoming setup attributes lands. */
2636   if (intel->gen >= 6) {
2637      for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
2638	 if (brw->fragment_program->Base.InputsRead & BITFIELD64_BIT(i)) {
2639	    urb_setup[i] = urb_next++;
2640	 }
2641      }
2642   } else {
2643      /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
2644      for (unsigned int i = 0; i < VERT_RESULT_MAX; i++) {
2645	 if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
2646	    int fp_index;
2647
2648	    if (i >= VERT_RESULT_VAR0)
2649	       fp_index = i - (VERT_RESULT_VAR0 - FRAG_ATTRIB_VAR0);
2650	    else if (i <= VERT_RESULT_TEX7)
2651	       fp_index = i;
2652	    else
2653	       fp_index = -1;
2654
2655	    if (fp_index >= 0)
2656	       urb_setup[fp_index] = urb_next++;
2657	 }
2658      }
2659   }
2660
2661   /* Each attribute is 4 setup channels, each of which is half a reg. */
2662   c->prog_data.urb_read_length = urb_next * 2;
2663}
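
/* Example of the arithmetic (hypothetical shader): if three attributes are
 * read, urb_next ends at 3 and urb_read_length becomes 6 -- four setup
 * channels per attribute at half a register each, matching the
 * urb_setup[location] * 2 addressing in interp_reg().
 */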
2664
2665void
2666fs_visitor::assign_urb_setup()
2667{
2668   int urb_start = c->prog_data.first_curbe_grf + c->prog_data.curb_read_length;
2669
2670   /* Offset all the urb_setup[] index by the actual position of the
2671    * setup regs, now that the location of the constants has been chosen.
2672    */
2673   foreach_iter(exec_list_iterator, iter, this->instructions) {
2674      fs_inst *inst = (fs_inst *)iter.get();
2675
2676      if (inst->opcode == FS_OPCODE_LINTERP) {
2677	 assert(inst->src[2].file == FIXED_HW_REG);
2678	 inst->src[2].fixed_hw_reg.nr += urb_start;
2679      }
2680
2681      if (inst->opcode == FS_OPCODE_CINTERP) {
2682	 assert(inst->src[0].file == FIXED_HW_REG);
2683	 inst->src[0].fixed_hw_reg.nr += urb_start;
2684      }
2685   }
2686
2687   this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
2688}
2689
2690/**
2691 * Split large virtual GRFs into separate components if we can.
2692 *
2693 * This is mostly duplicated with what brw_fs_vector_splitting does,
2694 * but that's really conservative because it's afraid of doing
2695 * splitting that doesn't result in real progress after the rest of
2696 * the optimization phases, which would cause infinite looping in
2697 * optimization.  We can do it once here, safely.  This also has the
2698 * opportunity to split interpolated values, or maybe even uniforms,
2699 * which we don't have at the IR level.
2700 *
2701 * We want to split, because virtual GRFs are what we register
2702 * allocate and spill (due to contiguousness requirements for some
2703 * instructions), and they're what we naturally generate in the
2704 * codegen process, but most virtual GRFs don't actually need to be
2705 * contiguous sets of GRFs.  If we split, we'll end up with reduced
2706 * live intervals and better dead code elimination and coalescing.
2707 */
2708void
2709fs_visitor::split_virtual_grfs()
2710{
2711   int num_vars = this->virtual_grf_next;
2712   bool split_grf[num_vars];
2713   int new_virtual_grf[num_vars];
2714
2715   /* Try to split anything larger than one register. */
2716   for (int i = 0; i < num_vars; i++) {
2717      if (this->virtual_grf_sizes[i] != 1)
2718	 split_grf[i] = true;
2719      else
2720	 split_grf[i] = false;
2721   }
2722
2723   if (brw->has_pln) {
2724      /* PLN opcodes rely on the delta_xy being contiguous. */
2725      split_grf[this->delta_x.reg] = false;
2726   }
2727
2728   foreach_iter(exec_list_iterator, iter, this->instructions) {
2729      fs_inst *inst = (fs_inst *)iter.get();
2730
2731      /* Texturing produces 4 contiguous registers, so no splitting. */
2732      if (inst->is_tex()) {
2733	 split_grf[inst->dst.reg] = false;
2734      }
2735   }
2736
2737   /* Allocate new space for split regs.  Note that the virtual
2738    * numbers will be contiguous.
2739    */
2740   for (int i = 0; i < num_vars; i++) {
2741      if (split_grf[i]) {
2742	 new_virtual_grf[i] = virtual_grf_alloc(1);
2743	 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
2744	    int reg = virtual_grf_alloc(1);
2745	    assert(reg == new_virtual_grf[i] + j - 1);
2746	    (void) reg;
2747	 }
2748	 this->virtual_grf_sizes[i] = 1;
2749      }
2750   }
2751
2752   foreach_iter(exec_list_iterator, iter, this->instructions) {
2753      fs_inst *inst = (fs_inst *)iter.get();
2754
2755      if (inst->dst.file == GRF &&
2756	  split_grf[inst->dst.reg] &&
2757	  inst->dst.reg_offset != 0) {
2758	 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
2759			  inst->dst.reg_offset - 1);
2760	 inst->dst.reg_offset = 0;
2761      }
2762      for (int i = 0; i < 3; i++) {
2763	 if (inst->src[i].file == GRF &&
2764	     split_grf[inst->src[i].reg] &&
2765	     inst->src[i].reg_offset != 0) {
2766	    inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
2767				inst->src[i].reg_offset - 1);
2768	    inst->src[i].reg_offset = 0;
2769	 }
2770      }
2771   }
2772   this->live_intervals_valid = false;
2773}
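
/* Example of the remapping (hypothetical register numbers): a size-4 virtual
 * GRF vgrf5 keeps vgrf5 for reg_offset 0, while offsets 1..3 are redirected
 * to three freshly allocated size-1 GRFs at new_virtual_grf[5] + 0, + 1 and
 * + 2, each with reg_offset reset to 0.  Every splittable GRF therefore ends
 * the pass with size 1, which is why the live intervals are invalidated.
 */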
2774
2775/**
2776 * Choose accesses from the UNIFORM file to demote to using the pull
2777 * constant buffer.
2778 *
2779 * We allow a fragment shader to have more than the specified minimum
2780 * maximum number of fragment shader uniform components (64).  If
2781 * there are too many of these, they'd fill up all of register space.
2782 * So, this will push some of them out to the pull constant buffer and
2783 * update the program to load them.
2784 */
2785void
2786fs_visitor::setup_pull_constants()
2787{
2788   /* Only allow 16 registers (128 uniform components) as push constants. */
2789   unsigned int max_uniform_components = 16 * 8;
2790   if (c->prog_data.nr_params <= max_uniform_components)
2791      return;
2792
2793   /* Just demote the end of the list.  We could probably do better
2794    * here, demoting things that are rarely used in the program first.
2795    */
2796   int pull_uniform_base = max_uniform_components;
2797   int pull_uniform_count = c->prog_data.nr_params - pull_uniform_base;
2798
2799   foreach_iter(exec_list_iterator, iter, this->instructions) {
2800      fs_inst *inst = (fs_inst *)iter.get();
2801
2802      for (int i = 0; i < 3; i++) {
2803	 if (inst->src[i].file != UNIFORM)
2804	    continue;
2805
2806	 int uniform_nr = inst->src[i].hw_reg + inst->src[i].reg_offset;
2807	 if (uniform_nr < pull_uniform_base)
2808	    continue;
2809
2810	 fs_reg dst = fs_reg(this, glsl_type::float_type);
2811	 fs_inst *pull = new(mem_ctx) fs_inst(FS_OPCODE_PULL_CONSTANT_LOAD,
2812					      dst);
2813	 pull->offset = ((uniform_nr - pull_uniform_base) * 4) & ~15;
2814	 pull->ir = inst->ir;
2815	 pull->annotation = inst->annotation;
2816	 pull->base_mrf = 14;
2817	 pull->mlen = 1;
2818
2819	 inst->insert_before(pull);
2820
2821	 inst->src[i].file = GRF;
2822	 inst->src[i].reg = dst.reg;
2823	 inst->src[i].reg_offset = 0;
2824	 inst->src[i].smear = (uniform_nr - pull_uniform_base) & 3;
2825      }
2826   }
2827
2828   for (int i = 0; i < pull_uniform_count; i++) {
2829      c->prog_data.pull_param[i] = c->prog_data.param[pull_uniform_base + i];
2830      c->prog_data.pull_param_convert[i] =
2831	 c->prog_data.param_convert[pull_uniform_base + i];
2832   }
2833   c->prog_data.nr_params -= pull_uniform_count;
2834   c->prog_data.nr_pull_params = pull_uniform_count;
2835}
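
/* Example of the demotion (hypothetical counts): with nr_params == 200,
 * params 0..127 stay in the push constant space and params 128..199 become
 * 72 pull params.  A use of uniform 130 then gets
 * offset = ((130 - 128) * 4) & ~15 == 0 and smear = (130 - 128) & 3 == 2,
 * i.e. it reads the third dword of the first 16-byte block fetched by the
 * inserted FS_OPCODE_PULL_CONSTANT_LOAD.
 */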
2836
2837void
2838fs_visitor::calculate_live_intervals()
2839{
2840   int num_vars = this->virtual_grf_next;
2841   int *def = ralloc_array(mem_ctx, int, num_vars);
2842   int *use = ralloc_array(mem_ctx, int, num_vars);
2843   int loop_depth = 0;
2844   int loop_start = 0;
2845   int bb_header_ip = 0;
2846
2847   if (this->live_intervals_valid)
2848      return;
2849
2850   for (int i = 0; i < num_vars; i++) {
2851      def[i] = MAX_INSTRUCTION;
2852      use[i] = -1;
2853   }
2854
2855   int ip = 0;
2856   foreach_iter(exec_list_iterator, iter, this->instructions) {
2857      fs_inst *inst = (fs_inst *)iter.get();
2858
2859      if (inst->opcode == BRW_OPCODE_DO) {
2860	 if (loop_depth++ == 0)
2861	    loop_start = ip;
2862      } else if (inst->opcode == BRW_OPCODE_WHILE) {
2863	 loop_depth--;
2864
2865	 if (loop_depth == 0) {
2866	    /* Patches up the use of vars marked for being live across
2867	     * the whole loop.
2868	     */
2869	    for (int i = 0; i < num_vars; i++) {
2870	       if (use[i] == loop_start) {
2871		  use[i] = ip;
2872	       }
2873	    }
2874	 }
2875      } else {
2876	 for (unsigned int i = 0; i < 3; i++) {
2877	    if (inst->src[i].file == GRF && inst->src[i].reg != 0) {
2878	       int reg = inst->src[i].reg;
2879
2880	       if (!loop_depth || (this->virtual_grf_sizes[reg] == 1 &&
2881				   def[reg] >= bb_header_ip)) {
2882		  use[reg] = ip;
2883	       } else {
2884		  def[reg] = MIN2(loop_start, def[reg]);
2885		  use[reg] = loop_start;
2886
2887		  /* Nobody else is going to go smash our start to
2888		   * later in the loop now, because def[reg] now
2889		   * points before the bb header.
2890		   */
2891	       }
2892	    }
2893	 }
2894	 if (inst->dst.file == GRF && inst->dst.reg != 0) {
2895	    int reg = inst->dst.reg;
2896
2897	    if (!loop_depth || (this->virtual_grf_sizes[reg] == 1 &&
2898				!inst->predicated)) {
2899	       def[reg] = MIN2(def[reg], ip);
2900	    } else {
2901	       def[reg] = MIN2(def[reg], loop_start);
2902	    }
2903	 }
2904      }
2905
2906      ip++;
2907
2908      /* Set the basic block header IP.  This is used for determining
2909       * if a complete def of single-register virtual GRF in a loop
2910       * dominates a use in the same basic block.  It's a quick way to
2911       * reduce the live interval range of most register used in a
2912       * loop.
2913       */
2914      if (inst->opcode == BRW_OPCODE_IF ||
2915	  inst->opcode == BRW_OPCODE_ELSE ||
2916	  inst->opcode == BRW_OPCODE_ENDIF ||
2917	  inst->opcode == BRW_OPCODE_DO ||
2918	  inst->opcode == BRW_OPCODE_WHILE ||
2919	  inst->opcode == BRW_OPCODE_BREAK ||
2920	  inst->opcode == BRW_OPCODE_CONTINUE) {
2921	 bb_header_ip = ip;
2922      }
2923   }
2924
2925   ralloc_free(this->virtual_grf_def);
2926   ralloc_free(this->virtual_grf_use);
2927   this->virtual_grf_def = def;
2928   this->virtual_grf_use = use;
2929
2930   this->live_intervals_valid = true;
2931}
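
/* Example of the loop handling (hypothetical IPs): a multi-register GRF
 * first written at ip 12 inside a loop whose DO sits at ip 10 gets its def
 * pulled back to 10, and a read inside the loop marks use = 10, which the
 * WHILE then extends to the loop's closing ip -- so the register is treated
 * as live across the entire loop rather than just between its textual def
 * and use.
 */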
2932
2933/**
2934 * Attempts to move immediate constants into the immediate
2935 * constant slot of following instructions.
2936 *
2937 * Immediate constants are a bit tricky -- they have to be in the last
2938 * operand slot, and you can't do abs/negate on them.
2939 */
2940
2941bool
2942fs_visitor::propagate_constants()
2943{
2944   bool progress = false;
2945
2946   calculate_live_intervals();
2947
2948   foreach_iter(exec_list_iterator, iter, this->instructions) {
2949      fs_inst *inst = (fs_inst *)iter.get();
2950
2951      if (inst->opcode != BRW_OPCODE_MOV ||
2952	  inst->predicated ||
2953	  inst->dst.file != GRF || inst->src[0].file != IMM ||
2954	  inst->dst.type != inst->src[0].type)
2955	 continue;
2956
2957      /* Don't bother with cases where we should have had the
2958       * operation on the constant folded in GLSL already.
2959       */
2960      if (inst->saturate)
2961	 continue;
2962
2963      /* Found a move of a constant to a GRF.  Find anything else using the GRF
2964       * before it's written, and replace it with the constant if we can.
2965       */
2966      exec_list_iterator scan_iter = iter;
2967      scan_iter.next();
2968      for (; scan_iter.has_next(); scan_iter.next()) {
2969	 fs_inst *scan_inst = (fs_inst *)scan_iter.get();
2970
2971	 if (scan_inst->opcode == BRW_OPCODE_DO ||
2972	     scan_inst->opcode == BRW_OPCODE_WHILE ||
2973	     scan_inst->opcode == BRW_OPCODE_ELSE ||
2974	     scan_inst->opcode == BRW_OPCODE_ENDIF) {
2975	    break;
2976	 }
2977
2978	 for (int i = 2; i >= 0; i--) {
2979	    if (scan_inst->src[i].file != GRF ||
2980		scan_inst->src[i].reg != inst->dst.reg ||
2981		scan_inst->src[i].reg_offset != inst->dst.reg_offset)
2982	       continue;
2983
2984	    /* Don't bother with cases where we should have had the
2985	     * operation on the constant folded in GLSL already.
2986	     */
2987	    if (scan_inst->src[i].negate || scan_inst->src[i].abs)
2988	       continue;
2989
2990	    switch (scan_inst->opcode) {
2991	    case BRW_OPCODE_MOV:
2992	       scan_inst->src[i] = inst->src[0];
2993	       progress = true;
2994	       break;
2995
2996	    case BRW_OPCODE_MUL:
2997	    case BRW_OPCODE_ADD:
2998	       if (i == 1) {
2999		  scan_inst->src[i] = inst->src[0];
3000		  progress = true;
3001	       } else if (i == 0 && scan_inst->src[1].file != IMM) {
3002		  /* Fit this constant in by commuting the operands */
3003		  scan_inst->src[0] = scan_inst->src[1];
3004		  scan_inst->src[1] = inst->src[0];
3005		  progress = true;
3006	       }
3007	       break;
3008	    case BRW_OPCODE_CMP:
3009	    case BRW_OPCODE_SEL:
3010	       if (i == 1) {
3011		  scan_inst->src[i] = inst->src[0];
3012		  progress = true;
3013	       }
3014	    }
3015	 }
3016
3017	 if (scan_inst->dst.file == GRF &&
3018	     scan_inst->dst.reg == inst->dst.reg &&
3019	     (scan_inst->dst.reg_offset == inst->dst.reg_offset ||
3020	      scan_inst->is_tex())) {
3021	    break;
3022	 }
3023      }
3024   }
3025
3026   if (progress)
3027       this->live_intervals_valid = false;
3028
3029   return progress;
3030}
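
/* Example of the commutation case above (hypothetical instructions):
 *
 *    MOV vgrf3, 2.0f
 *    MUL vgrf4, vgrf3, vgrf7
 *
 * becomes MUL vgrf4, vgrf7, 2.0f -- an immediate may only sit in the last
 * operand slot, so the sources are swapped rather than abandoning the
 * propagation.
 */
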
3031/**
3032 * Must be called after calculate_live_intervals() to remove unused
3033 * writes to registers -- register allocation will fail otherwise
3034 * because something defined but never used won't be considered to
3035 * interfere with other regs.
3036 */
3037bool
3038fs_visitor::dead_code_eliminate()
3039{
3040   bool progress = false;
3041   int pc = 0;
3042
3043   calculate_live_intervals();
3044
3045   foreach_iter(exec_list_iterator, iter, this->instructions) {
3046      fs_inst *inst = (fs_inst *)iter.get();
3047
3048      if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
3049	 inst->remove();
3050	 progress = true;
3051      }
3052
3053      pc++;
3054   }
3055
3056   if (progress)
3057      live_intervals_valid = false;
3058
3059   return progress;
3060}
3061
3062bool
3063fs_visitor::register_coalesce()
3064{
3065   bool progress = false;
3066   int if_depth = 0;
3067   int loop_depth = 0;
3068
3069   foreach_iter(exec_list_iterator, iter, this->instructions) {
3070      fs_inst *inst = (fs_inst *)iter.get();
3071
3072      /* Make sure that we dominate the instructions we're going to
3073       * scan for interfering with our coalescing, or we won't have
3074       * scanned enough to see if anything interferes with our
3075       * coalescing.  We don't dominate the following instructions if
3076       * we're in a loop or an if block.
3077       */
3078      switch (inst->opcode) {
3079      case BRW_OPCODE_DO:
3080	 loop_depth++;
3081	 break;
3082      case BRW_OPCODE_WHILE:
3083	 loop_depth--;
3084	 break;
3085      case BRW_OPCODE_IF:
3086	 if_depth++;
3087	 break;
3088      case BRW_OPCODE_ENDIF:
3089	 if_depth--;
3090	 break;
3091      }
3092      if (loop_depth || if_depth)
3093	 continue;
3094
3095      if (inst->opcode != BRW_OPCODE_MOV ||
3096	  inst->predicated ||
3097	  inst->saturate ||
3098	  inst->dst.file != GRF || inst->src[0].file != GRF ||
3099	  inst->dst.type != inst->src[0].type)
3100	 continue;
3101
3102      bool has_source_modifiers = inst->src[0].abs || inst->src[0].negate;
3103
3104      /* Found a move of a GRF to a GRF.  Let's see if we can coalesce
3105       * them: check for no writes to either one until the exit of the
3106       * program.
3107       */
3108      bool interfered = false;
3109      exec_list_iterator scan_iter = iter;
3110      scan_iter.next();
3111      for (; scan_iter.has_next(); scan_iter.next()) {
3112	 fs_inst *scan_inst = (fs_inst *)scan_iter.get();
3113
3114	 if (scan_inst->dst.file == GRF) {
3115	    if (scan_inst->dst.reg == inst->dst.reg &&
3116		(scan_inst->dst.reg_offset == inst->dst.reg_offset ||
3117		 scan_inst->is_tex())) {
3118	       interfered = true;
3119	       break;
3120	    }
3121	    if (scan_inst->dst.reg == inst->src[0].reg &&
3122		(scan_inst->dst.reg_offset == inst->src[0].reg_offset ||
3123		 scan_inst->is_tex())) {
3124	       interfered = true;
3125	       break;
3126	    }
3127	 }
3128
3129	 /* The gen6 MATH instruction can't handle source modifiers, so avoid
3130	  * coalescing those for now.  We should do something more specific.
3131	  */
3132	 if (intel->gen == 6 && scan_inst->is_math() && has_source_modifiers) {
3133	    interfered = true;
3134	    break;
3135	 }
3136      }
3137      if (interfered) {
3138	 continue;
3139      }
3140
3141      /* Rewrite the later usage to point at the source of the move to
3142       * be removed.
3143       */
3144      for (exec_list_iterator scan_iter = iter; scan_iter.has_next();
3145	   scan_iter.next()) {
3146	 fs_inst *scan_inst = (fs_inst *)scan_iter.get();
3147
3148	 for (int i = 0; i < 3; i++) {
3149	    if (scan_inst->src[i].file == GRF &&
3150		scan_inst->src[i].reg == inst->dst.reg &&
3151		scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
3152	       scan_inst->src[i].reg = inst->src[0].reg;
3153	       scan_inst->src[i].reg_offset = inst->src[0].reg_offset;
3154	       scan_inst->src[i].abs |= inst->src[0].abs;
3155	       scan_inst->src[i].negate ^= inst->src[0].negate;
3156	       scan_inst->src[i].smear = inst->src[0].smear;
3157	    }
3158	 }
3159      }
3160
3161      inst->remove();
3162      progress = true;
3163   }
3164
3165   if (progress)
3166      live_intervals_valid = false;
3167
3168   return progress;
3169}
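
/* Example (hypothetical): given "MOV vgrf8, vgrf2" where neither register
 * is written again before the end of the program, every later read of vgrf8
 * is redirected to vgrf2 -- inheriting the MOV's abs/negate/smear -- and
 * the MOV itself is removed.
 */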
3170
3172bool
3173fs_visitor::compute_to_mrf()
3174{
3175   bool progress = false;
3176   int next_ip = 0;
3177
3178   calculate_live_intervals();
3179
3180   foreach_iter(exec_list_iterator, iter, this->instructions) {
3181      fs_inst *inst = (fs_inst *)iter.get();
3182
3183      int ip = next_ip;
3184      next_ip++;
3185
3186      if (inst->opcode != BRW_OPCODE_MOV ||
3187	  inst->predicated ||
3188	  inst->dst.file != MRF || inst->src[0].file != GRF ||
3189	  inst->dst.type != inst->src[0].type ||
3190	  inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
3191	 continue;
3192
3193      /* Can't compute-to-MRF this GRF if someone else was going to
3194       * read it later.
3195       */
3196      if (this->virtual_grf_use[inst->src[0].reg] > ip)
3197	 continue;
3198
3199      /* Found a move of a GRF to a MRF.  Let's see if we can go
3200       * rewrite the thing that made this GRF to write into the MRF.
3201       */
3202      fs_inst *scan_inst;
3203      for (scan_inst = (fs_inst *)inst->prev;
3204	   scan_inst->prev != NULL;
3205	   scan_inst = (fs_inst *)scan_inst->prev) {
3206	 if (scan_inst->dst.file == GRF &&
3207	     scan_inst->dst.reg == inst->src[0].reg) {
3208	    /* Found the last thing to write our reg we want to turn
3209	     * into a compute-to-MRF.
3210	     */
3211
3212	    if (scan_inst->is_tex()) {
3213	       /* texturing writes several contiguous regs, so we can't
3214		* compute-to-mrf that.
3215		*/
3216	       break;
3217	    }
3218
3219	    /* If it's predicated, it (probably) didn't populate all
3220	     * the channels.
3221	     */
3222	    if (scan_inst->predicated)
3223	       break;
3224
3225	    /* SEND instructions can't have MRF as a destination. */
3226	    if (scan_inst->mlen)
3227	       break;
3228
3229	    if (intel->gen >= 6) {
3230	       /* gen6 math instructions must have the destination be
3231		* GRF, so no compute-to-MRF for them.
3232		*/
3233	       if (scan_inst->is_math()) {
3234		  break;
3235	       }
3236	    }
3237
3238	    if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
3239	       /* Found the creator of our MRF's source value. */
3240	       scan_inst->dst.file = MRF;
3241	       scan_inst->dst.hw_reg = inst->dst.hw_reg;
3242	       scan_inst->saturate |= inst->saturate;
3243	       inst->remove();
3244	       progress = true;
3245	    }
3246	    break;
3247	 }
3248
3249	 /* We don't handle flow control here.  Most computation of
3250	  * values that end up in MRFs happens shortly before the MRF
3251	  * write anyway.
3252	  */
3253	 if (scan_inst->opcode == BRW_OPCODE_DO ||
3254	     scan_inst->opcode == BRW_OPCODE_WHILE ||
3255	     scan_inst->opcode == BRW_OPCODE_ELSE ||
3256	     scan_inst->opcode == BRW_OPCODE_ENDIF) {
3257	    break;
3258	 }
3259
3260	 /* You can't read from an MRF, so if someone else reads our
3261	  * MRF's source GRF that we wanted to rewrite, that stops us.
3262	  */
3263	 bool interfered = false;
3264	 for (int i = 0; i < 3; i++) {
3265	    if (scan_inst->src[i].file == GRF &&
3266		scan_inst->src[i].reg == inst->src[0].reg &&
3267		scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
3268	       interfered = true;
3269	    }
3270	 }
3271	 if (interfered)
3272	    break;
3273
3274	 if (scan_inst->dst.file == MRF &&
3275	     scan_inst->dst.hw_reg == inst->dst.hw_reg) {
3276	    /* Somebody else wrote our MRF here, so we can't
3277	     * compute-to-MRF before that.
3278	     */
3279	    break;
3280	 }
3281
3282	 if (scan_inst->mlen > 0) {
3283	    /* Found a SEND instruction, which means that there are
3284	     * live values in MRFs from base_mrf to base_mrf +
3285	     * scan_inst->mlen - 1.  Don't go pushing our MRF write up
3286	     * above it.
3287	     */
3288	    if (inst->dst.hw_reg >= scan_inst->base_mrf &&
3289		inst->dst.hw_reg < scan_inst->base_mrf + scan_inst->mlen) {
3290	       break;
3291	    }
3292	 }
3293      }
3294   }
3295
3296   return progress;
3297}
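
/* Example (hypothetical instructions): for
 *
 *    ADD vgrf6, vgrf4, vgrf5
 *    MOV m2, vgrf6
 *
 * with no later readers of vgrf6, the ADD is rewritten to write m2 directly
 * (inheriting the MOV's saturate flag) and the MOV is removed, saving an
 * instruction and a temporary register.
 */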
3298
3299/**
3300 * Walks through basic blocks, looking for repeated MRF writes and
3301 * removing the later ones.
3302 */
3303bool
3304fs_visitor::remove_duplicate_mrf_writes()
3305{
3306   fs_inst *last_mrf_move[16];
3307   bool progress = false;
3308
3309   memset(last_mrf_move, 0, sizeof(last_mrf_move));
3310
3311   foreach_iter(exec_list_iterator, iter, this->instructions) {
3312      fs_inst *inst = (fs_inst *)iter.get();
3313
3314      switch (inst->opcode) {
3315      case BRW_OPCODE_DO:
3316      case BRW_OPCODE_WHILE:
3317      case BRW_OPCODE_IF:
3318      case BRW_OPCODE_ELSE:
3319      case BRW_OPCODE_ENDIF:
3320	 memset(last_mrf_move, 0, sizeof(last_mrf_move));
3321	 continue;
3322      default:
3323	 break;
3324      }
3325
3326      if (inst->opcode == BRW_OPCODE_MOV &&
3327	  inst->dst.file == MRF) {
3328	 fs_inst *prev_inst = last_mrf_move[inst->dst.hw_reg];
3329	 if (prev_inst && inst->equals(prev_inst)) {
3330	    inst->remove();
3331	    progress = true;
3332	    continue;
3333	 }
3334      }
3335
3336      /* Clear out the last-write records for MRFs that were overwritten. */
3337      if (inst->dst.file == MRF) {
3338	 last_mrf_move[inst->dst.hw_reg] = NULL;
3339      }
3340
3341      if (inst->mlen > 0) {
3342	 /* Found a SEND instruction, which will include two or fewer
3343	  * implied MRF writes.  We could do better here.
3344	  */
3345	 for (int i = 0; i < implied_mrf_writes(inst); i++) {
3346	    last_mrf_move[inst->base_mrf + i] = NULL;
3347	 }
3348      }
3349
3350      /* Clear out any MRF move records whose sources got overwritten. */
3351      if (inst->dst.file == GRF) {
3352	 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
3353	    if (last_mrf_move[i] &&
3354		last_mrf_move[i]->src[0].reg == inst->dst.reg) {
3355	       last_mrf_move[i] = NULL;
3356	    }
3357	 }
3358      }
3359
3360      if (inst->opcode == BRW_OPCODE_MOV &&
3361	  inst->dst.file == MRF &&
3362	  inst->src[0].file == GRF &&
3363	  !inst->predicated) {
3364	 last_mrf_move[inst->dst.hw_reg] = inst;
3365      }
3366   }
3367
3368   return progress;
3369}
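
/* Example (hypothetical): two identical "MOV m3, vgrf2" instructions with no
 * intervening write to m3 or vgrf2 and no intervening control flow -- as
 * happens when the same texture coordinate is loaded into the message
 * registers for back-to-back sends -- collapse to a single MOV.
 */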
3370
3371bool
3372fs_visitor::virtual_grf_interferes(int a, int b)
3373{
3374   int start = MAX2(this->virtual_grf_def[a], this->virtual_grf_def[b]);
3375   int end = MIN2(this->virtual_grf_use[a], this->virtual_grf_use[b]);
3376
3377   /* We can't handle dead register writes here, without iterating
3378    * over the whole instruction stream to find every single dead
3379    * write to that register to compare to the live interval of the
3380    * other register.  Just assert that dead_code_eliminate() has been
3381    * called.
3382    */
3383   assert((this->virtual_grf_use[a] != -1 ||
3384	   this->virtual_grf_def[a] == MAX_INSTRUCTION) &&
3385	  (this->virtual_grf_use[b] != -1 ||
3386	   this->virtual_grf_def[b] == MAX_INSTRUCTION));
3387
3388   return start < end;
3389}
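
/* Example of the interval test (hypothetical IPs): vgrf2 with def 4 / use 9
 * and vgrf3 with def 8 / use 12 give start = max(4, 8) = 8 and
 * end = min(9, 12) = 9, so start < end and they interfere; had vgrf3 first
 * been defined at ip 9 or later, the two could share a hardware register.
 */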
3390
3391static struct brw_reg brw_reg_from_fs_reg(fs_reg *reg)
3392{
3393   struct brw_reg brw_reg;
3394
3395   switch (reg->file) {
3396   case GRF:
3397   case ARF:
3398   case MRF:
3399      if (reg->smear == -1) {
3400	 brw_reg = brw_vec8_reg(reg->file,
3401				reg->hw_reg, 0);
3402      } else {
3403	 brw_reg = brw_vec1_reg(reg->file,
3404				reg->hw_reg, reg->smear);
3405      }
3406      brw_reg = retype(brw_reg, reg->type);
3407      break;
3408   case IMM:
3409      switch (reg->type) {
3410      case BRW_REGISTER_TYPE_F:
3411	 brw_reg = brw_imm_f(reg->imm.f);
3412	 break;
3413      case BRW_REGISTER_TYPE_D:
3414	 brw_reg = brw_imm_d(reg->imm.i);
3415	 break;
3416      case BRW_REGISTER_TYPE_UD:
3417	 brw_reg = brw_imm_ud(reg->imm.u);
3418	 break;
3419      default:
3420	 assert(!"not reached");
3421	 brw_reg = brw_null_reg();
3422	 break;
3423      }
3424      break;
3425   case FIXED_HW_REG:
3426      brw_reg = reg->fixed_hw_reg;
3427      break;
3428   case BAD_FILE:
3429      /* Probably unused. */
3430      brw_reg = brw_null_reg();
3431      break;
3432   case UNIFORM:
3433      assert(!"not reached");
3434      brw_reg = brw_null_reg();
3435      break;
3436   default:
3437      assert(!"not reached");
3438      brw_reg = brw_null_reg();
3439      break;
3440   }
3441   if (reg->abs)
3442      brw_reg = brw_abs(brw_reg);
3443   if (reg->negate)
3444      brw_reg = negate(brw_reg);
3445
3446   return brw_reg;
3447}
3448
3449void
3450fs_visitor::generate_code()
3451{
3452   int last_native_inst = 0;
3453   const char *last_annotation_string = NULL;
3454   ir_instruction *last_annotation_ir = NULL;
3455
3456   int if_stack_array_size = 16;
3457   int loop_stack_array_size = 16;
3458   int if_stack_depth = 0, loop_stack_depth = 0;
3459   brw_instruction **if_stack =
3460      rzalloc_array(this->mem_ctx, brw_instruction *, if_stack_array_size);
3461   brw_instruction **loop_stack =
3462      rzalloc_array(this->mem_ctx, brw_instruction *, loop_stack_array_size);
3463   int *if_depth_in_loop =
3464      rzalloc_array(this->mem_ctx, int, loop_stack_array_size);
3465
3467   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3468      printf("Native code for fragment shader %d:\n",
3469	     ctx->Shader.CurrentFragmentProgram->Name);
3470   }
3471
3472   foreach_iter(exec_list_iterator, iter, this->instructions) {
3473      fs_inst *inst = (fs_inst *)iter.get();
3474      struct brw_reg src[3], dst;
3475
3476      if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3477	 if (last_annotation_ir != inst->ir) {
3478	    last_annotation_ir = inst->ir;
3479	    if (last_annotation_ir) {
3480	       printf("   ");
3481	       last_annotation_ir->print();
3482	       printf("\n");
3483	    }
3484	 }
3485	 if (last_annotation_string != inst->annotation) {
3486	    last_annotation_string = inst->annotation;
3487	    if (last_annotation_string)
3488	       printf("   %s\n", last_annotation_string);
3489	 }
3490      }
3491
3492      for (unsigned int i = 0; i < 3; i++) {
3493	 src[i] = brw_reg_from_fs_reg(&inst->src[i]);
3494      }
3495      dst = brw_reg_from_fs_reg(&inst->dst);
3496
3497      brw_set_conditionalmod(p, inst->conditional_mod);
3498      brw_set_predicate_control(p, inst->predicated);
3499      brw_set_saturate(p, inst->saturate);
3500
3501      switch (inst->opcode) {
3502      case BRW_OPCODE_MOV:
3503	 brw_MOV(p, dst, src[0]);
3504	 break;
3505      case BRW_OPCODE_ADD:
3506	 brw_ADD(p, dst, src[0], src[1]);
3507	 break;
3508      case BRW_OPCODE_MUL:
3509	 brw_MUL(p, dst, src[0], src[1]);
3510	 break;
3511
3512      case BRW_OPCODE_FRC:
3513	 brw_FRC(p, dst, src[0]);
3514	 break;
3515      case BRW_OPCODE_RNDD:
3516	 brw_RNDD(p, dst, src[0]);
3517	 break;
3518      case BRW_OPCODE_RNDE:
3519	 brw_RNDE(p, dst, src[0]);
3520	 break;
3521      case BRW_OPCODE_RNDZ:
3522	 brw_RNDZ(p, dst, src[0]);
3523	 break;
3524
3525      case BRW_OPCODE_AND:
3526	 brw_AND(p, dst, src[0], src[1]);
3527	 break;
3528      case BRW_OPCODE_OR:
3529	 brw_OR(p, dst, src[0], src[1]);
3530	 break;
3531      case BRW_OPCODE_XOR:
3532	 brw_XOR(p, dst, src[0], src[1]);
3533	 break;
3534      case BRW_OPCODE_NOT:
3535	 brw_NOT(p, dst, src[0]);
3536	 break;
3537      case BRW_OPCODE_ASR:
3538	 brw_ASR(p, dst, src[0], src[1]);
3539	 break;
3540      case BRW_OPCODE_SHR:
3541	 brw_SHR(p, dst, src[0], src[1]);
3542	 break;
3543      case BRW_OPCODE_SHL:
3544	 brw_SHL(p, dst, src[0], src[1]);
3545	 break;
3546
3547      case BRW_OPCODE_CMP:
3548	 brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
3549	 break;
3550      case BRW_OPCODE_SEL:
3551	 brw_SEL(p, dst, src[0], src[1]);
3552	 break;
3553
3554      case BRW_OPCODE_IF:
3555	 if (inst->src[0].file != BAD_FILE) {
3556	    assert(intel->gen >= 6);
3557	    if_stack[if_stack_depth] = gen6_IF(p, inst->conditional_mod, src[0], src[1]);
3558	 } else {
3559	    if_stack[if_stack_depth] = brw_IF(p, BRW_EXECUTE_8);
3560	 }
3561	 if_depth_in_loop[loop_stack_depth]++;
3562	 if_stack_depth++;
3563	 if (if_stack_array_size <= if_stack_depth) {
3564	    if_stack_array_size *= 2;
3565	    if_stack = reralloc(this->mem_ctx, if_stack, brw_instruction *,
3566			        if_stack_array_size);
3567	 }
3568	 break;
3569
3570      case BRW_OPCODE_ELSE:
3571	 if_stack[if_stack_depth - 1] =
3572	    brw_ELSE(p, if_stack[if_stack_depth - 1]);
3573	 break;
3574      case BRW_OPCODE_ENDIF:
3575	 if_stack_depth--;
3576	 brw_ENDIF(p, if_stack[if_stack_depth]);
3577	 if_depth_in_loop[loop_stack_depth]--;
3578	 break;
3579
3580      case BRW_OPCODE_DO:
3581	 loop_stack[loop_stack_depth++] = brw_DO(p, BRW_EXECUTE_8);
3582	 if (loop_stack_array_size <= loop_stack_depth) {
3583	    loop_stack_array_size *= 2;
3584	    loop_stack = reralloc(this->mem_ctx, loop_stack, brw_instruction *,
3585				  loop_stack_array_size);
3586	    if_depth_in_loop = reralloc(this->mem_ctx, if_depth_in_loop, int,
3587				        loop_stack_array_size);
3588	 }
3589	 if_depth_in_loop[loop_stack_depth] = 0;
3590	 break;
3591
3592      case BRW_OPCODE_BREAK:
3593	 brw_BREAK(p, if_depth_in_loop[loop_stack_depth]);
3594	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
3595	 break;
3596      case BRW_OPCODE_CONTINUE:
3597	 /* FINISHME: We still need to write the loop instruction support. */
3598	 if (intel->gen >= 6)
3599	    gen6_CONT(p, loop_stack[loop_stack_depth - 1]);
3600	 else
3601	    brw_CONT(p, if_depth_in_loop[loop_stack_depth]);
3602	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
3603	 break;
3604
3605      case BRW_OPCODE_WHILE: {
3606	 struct brw_instruction *inst0, *inst1;
3607	 GLuint br = 1;
3608
3609	 if (intel->gen >= 5)
3610	    br = 2;
3611
3612	 assert(loop_stack_depth > 0);
3613	 loop_stack_depth--;
3614	 inst0 = inst1 = brw_WHILE(p, loop_stack[loop_stack_depth]);
3615	 if (intel->gen < 6) {
3616	    /* patch all the BREAK/CONT instructions from last BGNLOOP */
3617	    while (inst0 > loop_stack[loop_stack_depth]) {
3618	       inst0--;
3619	       if (inst0->header.opcode == BRW_OPCODE_BREAK &&
3620		   inst0->bits3.if_else.jump_count == 0) {
3621		  inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
3622	       }
3623	       else if (inst0->header.opcode == BRW_OPCODE_CONTINUE &&
3624			inst0->bits3.if_else.jump_count == 0) {
3625		  inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
3626	       }
3627	    }
3628	 }
3629      }
3630	 break;
3631
      case FS_OPCODE_RCP:
      case FS_OPCODE_RSQ:
      case FS_OPCODE_SQRT:
      case FS_OPCODE_EXP2:
      case FS_OPCODE_LOG2:
      case FS_OPCODE_POW:
      case FS_OPCODE_SIN:
      case FS_OPCODE_COS:
	 generate_math(inst, dst, src);
	 break;
      case FS_OPCODE_CINTERP:
	 brw_MOV(p, dst, src[0]);
	 break;
      case FS_OPCODE_LINTERP:
	 generate_linterp(inst, dst, src);
	 break;
      case FS_OPCODE_TEX:
      case FS_OPCODE_TXB:
      case FS_OPCODE_TXD:
      case FS_OPCODE_TXL:
	 generate_tex(inst, dst, src[0]);
	 break;
      case FS_OPCODE_DISCARD_NOT:
	 generate_discard_not(inst, dst);
	 break;
      case FS_OPCODE_DISCARD_AND:
	 generate_discard_and(inst, src[0]);
	 break;
      case FS_OPCODE_DDX:
	 generate_ddx(inst, dst, src[0]);
	 break;
      case FS_OPCODE_DDY:
	 generate_ddy(inst, dst, src[0]);
	 break;

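      /* Spill code: these opcodes only show up when register allocation had
       * to demote a virtual GRF to scratch memory (see spill_reg()).
       */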
      case FS_OPCODE_SPILL:
	 generate_spill(inst, src[0]);
	 break;

      case FS_OPCODE_UNSPILL:
	 generate_unspill(inst, dst);
	 break;

      case FS_OPCODE_PULL_CONSTANT_LOAD:
	 generate_pull_constant_load(inst, dst);
	 break;

      case FS_OPCODE_FB_WRITE:
	 generate_fb_write(inst);
	 break;
      default:
	 if (inst->opcode < (int)ARRAY_SIZE(brw_opcodes)) {
	    _mesa_problem(ctx, "Unsupported opcode `%s' in FS",
			  brw_opcodes[inst->opcode].name);
	 } else {
	    _mesa_problem(ctx, "Unsupported opcode %d in FS", inst->opcode);
	 }
	 this->fail = true;
      }

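      /* Under INTEL_DEBUG=wm, disassemble the native instructions generated
       * for this IR instruction; last_native_inst marks where the previous
       * one left off.  Flip the if (0) to also dump the raw dwords.
       */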
      if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
	 for (unsigned int i = last_native_inst; i < p->nr_insn; i++) {
	    if (0) {
	       printf("0x%08x 0x%08x 0x%08x 0x%08x ",
		      ((uint32_t *)&p->store[i])[3],
		      ((uint32_t *)&p->store[i])[2],
		      ((uint32_t *)&p->store[i])[1],
		      ((uint32_t *)&p->store[i])[0]);
	    }
	    brw_disasm(stdout, &p->store[i], intel->gen);
	 }
      }

      last_native_inst = p->nr_insn;
   }

   ralloc_free(if_stack);
   ralloc_free(loop_stack);
   ralloc_free(if_depth_in_loop);

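   /* All instructions are emitted and their final offsets are known, so
    * resolve the JIP/UIP jump offsets in the gen6 flow-control
    * instructions.
    */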
   brw_set_uip_jip(p);

   /* While the INTEL_DEBUG=wm dump above is very nice for debugging FS
    * emit issues, it doesn't include the jump distances in the output,
    * which are often what we actually want to debug.  So this is here in
    * case you're doing that.
    */
   if (0) {
      if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
	 for (unsigned int i = 0; i < p->nr_insn; i++) {
	    printf("0x%08x 0x%08x 0x%08x 0x%08x ",
		   ((uint32_t *)&p->store[i])[3],
		   ((uint32_t *)&p->store[i])[2],
		   ((uint32_t *)&p->store[i])[1],
		   ((uint32_t *)&p->store[i])[0]);
	    brw_disasm(stdout, &p->store[i], intel->gen);
	 }
      }
   }
}

GLboolean
brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c)
{
   struct intel_context *intel = &brw->intel;
   struct gl_context *ctx = &intel->ctx;
   struct gl_shader_program *prog = ctx->Shader.CurrentFragmentProgram;

   if (!prog)
      return GL_FALSE;

   struct brw_shader *shader =
      (struct brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
   if (!shader)
      return GL_FALSE;

   /* We always use 8-wide mode, at least for now.  For one thing, flow
    * control only works in 8-wide.  Also, when we're fragment-shader
    * bound we're almost always under register pressure as well, so
    * 8-wide saves us from the performance cliff of spilling regs.
    */
   c->dispatch_width = 8;

   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
      printf("GLSL IR for native fragment shader %d:\n", prog->Name);
      _mesa_print_ir(shader->ir, NULL);
      printf("\n");
   }

   /* Now the main event: Visit the shader IR and generate our FS IR for it.
    */
   fs_visitor v(c, shader);

   if (0) {
      v.emit_dummy_fs();
   } else {
      v.calculate_urb_setup();
      if (intel->gen < 6)
	 v.emit_interpolation_setup_gen4();
      else
	 v.emit_interpolation_setup_gen6();

      /* Generate FS IR for main().  (The visitor only descends into
       * functions called "main".)
       */
      foreach_iter(exec_list_iterator, iter, *shader->ir) {
	 ir_instruction *ir = (ir_instruction *)iter.get();
	 v.base_ir = ir;
	 ir->accept(&v);
      }

      v.emit_fb_writes();

      v.split_virtual_grfs();

      v.setup_paramvalues_refs();
      v.setup_pull_constants();

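      /* Run the peephole passes to a fixed point, since each one can
       * expose new opportunities for the others (e.g. constant propagation
       * leaving dead writes behind for dead_code_eliminate to remove).
       */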
      bool progress;
      do {
	 progress = false;

	 progress = v.remove_duplicate_mrf_writes() || progress;

	 progress = v.propagate_constants() || progress;
	 progress = v.register_coalesce() || progress;
	 progress = v.compute_to_mrf() || progress;
	 progress = v.dead_code_eliminate() || progress;
      } while (progress);

      v.schedule_instructions();

      v.assign_curb_setup();
      v.assign_urb_setup();

      if (0) {
	 /* Debug of register spilling: Go spill everything. */
	 int virtual_grf_count = v.virtual_grf_next;
	 for (int i = 1; i < virtual_grf_count; i++) {
	    v.spill_reg(i);
	 }
      }

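      /* Register allocation: assign_regs() returns false after spilling a
       * register when allocation fails, so keep retrying until it succeeds
       * or gives up for good (v.fail).  assign_regs_trivial() is a debug
       * path that skips graph coloring entirely.
       */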
      if (0)
	 v.assign_regs_trivial();
      else {
	 while (!v.assign_regs()) {
	    if (v.fail)
	       break;
	 }
      }
   }

   if (!v.fail)
      v.generate_code();

   assert(!v.fail); /* FINISHME: Cleanly fail, tested at link time, etc. */

   if (v.fail)
      return GL_FALSE;

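   /* Record how many GRFs the shader ended up using; the rest of the WM
    * state setup reads this when programming the hardware.
    */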
   c->prog_data.total_grf = v.grf_used;

   return GL_TRUE;
}
