brw_fs.cpp revision b76378d46a211521582cfab56dc05031a57502a6
1/*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24/** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31extern "C" {
32
33#include <sys/types.h>
34
35#include "main/macros.h"
36#include "main/shaderobj.h"
37#include "main/uniforms.h"
38#include "program/prog_parameter.h"
39#include "program/prog_print.h"
40#include "program/register_allocate.h"
41#include "program/sampler.h"
42#include "program/hash_table.h"
43#include "brw_context.h"
44#include "brw_eu.h"
45#include "brw_wm.h"
46}
47#include "brw_shader.h"
48#include "brw_fs.h"
49#include "../glsl/glsl_types.h"
50#include "../glsl/ir_print_visitor.h"
51
52#define MAX_INSTRUCTION (1 << 30)
53
54int
55fs_visitor::type_size(const struct glsl_type *type)
56{
57   unsigned int size, i;
58
59   switch (type->base_type) {
60   case GLSL_TYPE_UINT:
61   case GLSL_TYPE_INT:
62   case GLSL_TYPE_FLOAT:
63   case GLSL_TYPE_BOOL:
64      return type->components();
65   case GLSL_TYPE_ARRAY:
66      return type_size(type->fields.array) * type->length;
67   case GLSL_TYPE_STRUCT:
68      size = 0;
69      for (i = 0; i < type->length; i++) {
70	 size += type_size(type->fields.structure[i].type);
71      }
72      return size;
73   case GLSL_TYPE_SAMPLER:
74      /* Samplers take up no register space, since they're baked in at
75       * link time.
76       */
77      return 0;
78   default:
79      assert(!"not reached");
80      return 0;
81   }
82}
83
84void
85fs_visitor::fail(const char *format, ...)
86{
87   va_list va;
88   char *msg;
89
90   if (failed)
91      return;
92
93   failed = true;
94
95   va_start(va, format);
96   msg = ralloc_vasprintf(mem_ctx, format, va);
97   va_end(va);
98   msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
99
100   this->fail_msg = msg;
101
102   if (INTEL_DEBUG & DEBUG_WM) {
103      fprintf(stderr, "%s",  msg);
104   }
105}
106
107void
108fs_visitor::push_force_uncompressed()
109{
110   force_uncompressed_stack++;
111}
112
113void
114fs_visitor::pop_force_uncompressed()
115{
116   force_uncompressed_stack--;
117   assert(force_uncompressed_stack >= 0);
118}
119
120void
121fs_visitor::push_force_sechalf()
122{
123   force_sechalf_stack++;
124}
125
126void
127fs_visitor::pop_force_sechalf()
128{
129   force_sechalf_stack--;
130   assert(force_sechalf_stack >= 0);
131}
132
133/**
134 * Returns how many MRFs an FS opcode will write over.
135 *
136 * Note that this is not the 0 or 1 implied writes in an actual gen
137 * instruction -- the FS opcodes often generate MOVs in addition.
138 */
139int
140fs_visitor::implied_mrf_writes(fs_inst *inst)
141{
142   if (inst->mlen == 0)
143      return 0;
144
145   switch (inst->opcode) {
146   case FS_OPCODE_RCP:
147   case FS_OPCODE_RSQ:
148   case FS_OPCODE_SQRT:
149   case FS_OPCODE_EXP2:
150   case FS_OPCODE_LOG2:
151   case FS_OPCODE_SIN:
152   case FS_OPCODE_COS:
153      return 1 * c->dispatch_width / 8;
154   case FS_OPCODE_POW:
155      return 2 * c->dispatch_width / 8;
156   case FS_OPCODE_TEX:
157   case FS_OPCODE_TXB:
158   case FS_OPCODE_TXD:
159   case FS_OPCODE_TXL:
160      return 1;
161   case FS_OPCODE_FB_WRITE:
162      return 2;
163   case FS_OPCODE_PULL_CONSTANT_LOAD:
164   case FS_OPCODE_UNSPILL:
165      return 1;
166   case FS_OPCODE_SPILL:
167      return 2;
168   default:
169      assert(!"not reached");
170      return inst->mlen;
171   }
172}
173
174int
175fs_visitor::virtual_grf_alloc(int size)
176{
177   if (virtual_grf_array_size <= virtual_grf_next) {
178      if (virtual_grf_array_size == 0)
179	 virtual_grf_array_size = 16;
180      else
181	 virtual_grf_array_size *= 2;
182      virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
183				   virtual_grf_array_size);
184   }
185   virtual_grf_sizes[virtual_grf_next] = size;
186   return virtual_grf_next++;
187}
188
189/** Fixed HW reg constructor. */
190fs_reg::fs_reg(enum register_file file, int hw_reg)
191{
192   init();
193   this->file = file;
194   this->hw_reg = hw_reg;
195   this->type = BRW_REGISTER_TYPE_F;
196}
197
198/** Fixed HW reg constructor. */
199fs_reg::fs_reg(enum register_file file, int hw_reg, uint32_t type)
200{
201   init();
202   this->file = file;
203   this->hw_reg = hw_reg;
204   this->type = type;
205}
206
207/** Automatic reg constructor. */
208fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
209{
210   init();
211
212   this->file = GRF;
213   this->reg = v->virtual_grf_alloc(v->type_size(type));
214   this->reg_offset = 0;
215   this->type = brw_type_for_base_type(type);
216}
217
218fs_reg *
219fs_visitor::variable_storage(ir_variable *var)
220{
221   return (fs_reg *)hash_table_find(this->variable_ht, var);
222}
223
224void
225import_uniforms_callback(const void *key,
226			 void *data,
227			 void *closure)
228{
229   struct hash_table *dst_ht = (struct hash_table *)closure;
230   const fs_reg *reg = (const fs_reg *)data;
231
232   if (reg->file != UNIFORM)
233      return;
234
235   hash_table_insert(dst_ht, data, key);
236}
237
238/* For 16-wide, we need to follow from the uniform setup of 8-wide dispatch.
239 * This brings in those uniform definitions
240 */
241void
242fs_visitor::import_uniforms(fs_visitor *v)
243{
244   hash_table_call_foreach(v->variable_ht,
245			   import_uniforms_callback,
246			   variable_ht);
247   this->params_remap = v->params_remap;
248}
249
250/* Our support for uniforms is piggy-backed on the struct
251 * gl_fragment_program, because that's where the values actually
252 * get stored, rather than in some global gl_shader_program uniform
253 * store.
254 */
255int
256fs_visitor::setup_uniform_values(int loc, const glsl_type *type)
257{
258   unsigned int offset = 0;
259
260   if (type->is_matrix()) {
261      const glsl_type *column = glsl_type::get_instance(GLSL_TYPE_FLOAT,
262							type->vector_elements,
263							1);
264
265      for (unsigned int i = 0; i < type->matrix_columns; i++) {
266	 offset += setup_uniform_values(loc + offset, column);
267      }
268
269      return offset;
270   }
271
272   switch (type->base_type) {
273   case GLSL_TYPE_FLOAT:
274   case GLSL_TYPE_UINT:
275   case GLSL_TYPE_INT:
276   case GLSL_TYPE_BOOL:
277      for (unsigned int i = 0; i < type->vector_elements; i++) {
278	 unsigned int param = c->prog_data.nr_params++;
279
280	 assert(param < ARRAY_SIZE(c->prog_data.param));
281
282	 switch (type->base_type) {
283	 case GLSL_TYPE_FLOAT:
284	    c->prog_data.param_convert[param] = PARAM_NO_CONVERT;
285	    break;
286	 case GLSL_TYPE_UINT:
287	    c->prog_data.param_convert[param] = PARAM_CONVERT_F2U;
288	    break;
289	 case GLSL_TYPE_INT:
290	    c->prog_data.param_convert[param] = PARAM_CONVERT_F2I;
291	    break;
292	 case GLSL_TYPE_BOOL:
293	    c->prog_data.param_convert[param] = PARAM_CONVERT_F2B;
294	    break;
295	 default:
296	    assert(!"not reached");
297	    c->prog_data.param_convert[param] = PARAM_NO_CONVERT;
298	    break;
299	 }
300	 this->param_index[param] = loc;
301	 this->param_offset[param] = i;
302      }
303      return 1;
304
305   case GLSL_TYPE_STRUCT:
306      for (unsigned int i = 0; i < type->length; i++) {
307	 offset += setup_uniform_values(loc + offset,
308					type->fields.structure[i].type);
309      }
310      return offset;
311
312   case GLSL_TYPE_ARRAY:
313      for (unsigned int i = 0; i < type->length; i++) {
314	 offset += setup_uniform_values(loc + offset, type->fields.array);
315      }
316      return offset;
317
318   case GLSL_TYPE_SAMPLER:
319      /* The sampler takes up a slot, but we don't use any values from it. */
320      return 1;
321
322   default:
323      assert(!"not reached");
324      return 0;
325   }
326}
327
328
329/* Our support for builtin uniforms is even scarier than non-builtin.
330 * It sits on top of the PROG_STATE_VAR parameters that are
331 * automatically updated from GL context state.
332 */
333void
334fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
335{
336   const ir_state_slot *const slots = ir->state_slots;
337   assert(ir->state_slots != NULL);
338
339   for (unsigned int i = 0; i < ir->num_state_slots; i++) {
340      /* This state reference has already been setup by ir_to_mesa, but we'll
341       * get the same index back here.
342       */
343      int index = _mesa_add_state_reference(this->fp->Base.Parameters,
344					    (gl_state_index *)slots[i].tokens);
345
346      /* Add each of the unique swizzles of the element as a parameter.
347       * This'll end up matching the expected layout of the
348       * array/matrix/structure we're trying to fill in.
349       */
350      int last_swiz = -1;
351      for (unsigned int j = 0; j < 4; j++) {
352	 int swiz = GET_SWZ(slots[i].swizzle, j);
353	 if (swiz == last_swiz)
354	    break;
355	 last_swiz = swiz;
356
357	 c->prog_data.param_convert[c->prog_data.nr_params] =
358	    PARAM_NO_CONVERT;
359	 this->param_index[c->prog_data.nr_params] = index;
360	 this->param_offset[c->prog_data.nr_params] = swiz;
361	 c->prog_data.nr_params++;
362      }
363   }
364}
365
366fs_reg *
367fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
368{
369   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
370   fs_reg wpos = *reg;
371   bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;
372
373   /* gl_FragCoord.x */
374   if (ir->pixel_center_integer) {
375      emit(BRW_OPCODE_MOV, wpos, this->pixel_x);
376   } else {
377      emit(BRW_OPCODE_ADD, wpos, this->pixel_x, fs_reg(0.5f));
378   }
379   wpos.reg_offset++;
380
381   /* gl_FragCoord.y */
382   if (!flip && ir->pixel_center_integer) {
383      emit(BRW_OPCODE_MOV, wpos, this->pixel_y);
384   } else {
385      fs_reg pixel_y = this->pixel_y;
386      float offset = (ir->pixel_center_integer ? 0.0 : 0.5);
387
388      if (flip) {
389	 pixel_y.negate = true;
390	 offset += c->key.drawable_height - 1.0;
391      }
392
393      emit(BRW_OPCODE_ADD, wpos, pixel_y, fs_reg(offset));
394   }
395   wpos.reg_offset++;
396
397   /* gl_FragCoord.z */
398   if (intel->gen >= 6) {
399      emit(BRW_OPCODE_MOV, wpos,
400	   fs_reg(brw_vec8_grf(c->source_depth_reg, 0)));
401   } else {
402      emit(FS_OPCODE_LINTERP, wpos, this->delta_x, this->delta_y,
403	   interp_reg(FRAG_ATTRIB_WPOS, 2));
404   }
405   wpos.reg_offset++;
406
407   /* gl_FragCoord.w: Already set up in emit_interpolation */
408   emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
409
410   return reg;
411}
412
413fs_reg *
414fs_visitor::emit_general_interpolation(ir_variable *ir)
415{
416   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
417   /* Interpolation is always in floating point regs. */
418   reg->type = BRW_REGISTER_TYPE_F;
419   fs_reg attr = *reg;
420
421   unsigned int array_elements;
422   const glsl_type *type;
423
424   if (ir->type->is_array()) {
425      array_elements = ir->type->length;
426      if (array_elements == 0) {
427	 fail("dereferenced array '%s' has length 0\n", ir->name);
428      }
429      type = ir->type->fields.array;
430   } else {
431      array_elements = 1;
432      type = ir->type;
433   }
434
435   int location = ir->location;
436   for (unsigned int i = 0; i < array_elements; i++) {
437      for (unsigned int j = 0; j < type->matrix_columns; j++) {
438	 if (urb_setup[location] == -1) {
439	    /* If there's no incoming setup data for this slot, don't
440	     * emit interpolation for it.
441	     */
442	    attr.reg_offset += type->vector_elements;
443	    location++;
444	    continue;
445	 }
446
447	 bool is_gl_Color =
448	    location == FRAG_ATTRIB_COL0 || location == FRAG_ATTRIB_COL1;
449
450	 if (c->key.flat_shade && is_gl_Color) {
451	    /* Constant interpolation (flat shading) case. The SF has
452	     * handed us defined values in only the constant offset
453	     * field of the setup reg.
454	     */
455	    for (unsigned int k = 0; k < type->vector_elements; k++) {
456	       struct brw_reg interp = interp_reg(location, k);
457	       interp = suboffset(interp, 3);
458	       emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
459	       attr.reg_offset++;
460	    }
461	 } else {
462	    /* Perspective interpolation case. */
463	    for (unsigned int k = 0; k < type->vector_elements; k++) {
464	       /* FINISHME: At some point we probably want to push
465		* this farther by giving similar treatment to the
466		* other potentially constant components of the
467		* attribute, as well as making brw_vs_constval.c
468		* handle varyings other than gl_TexCoord.
469		*/
470	       if (location >= FRAG_ATTRIB_TEX0 &&
471		   location <= FRAG_ATTRIB_TEX7 &&
472		   k == 3 && !(c->key.proj_attrib_mask & (1 << location))) {
473		  emit(BRW_OPCODE_MOV, attr, fs_reg(1.0f));
474	       } else {
475		  struct brw_reg interp = interp_reg(location, k);
476		  emit(FS_OPCODE_LINTERP, attr,
477		       this->delta_x, this->delta_y, fs_reg(interp));
478	       }
479	       attr.reg_offset++;
480	    }
481
482	    if (intel->gen < 6) {
483	       attr.reg_offset -= type->vector_elements;
484	       for (unsigned int k = 0; k < type->vector_elements; k++) {
485		  emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
486		  attr.reg_offset++;
487	       }
488	    }
489	 }
490	 location++;
491      }
492   }
493
494   return reg;
495}
496
497fs_reg *
498fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
499{
500   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
501
502   /* The frontfacing comes in as a bit in the thread payload. */
503   if (intel->gen >= 6) {
504      emit(BRW_OPCODE_ASR, *reg,
505	   fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
506	   fs_reg(15));
507      emit(BRW_OPCODE_NOT, *reg, *reg);
508      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
509   } else {
510      struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
511      /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
512       * us front face
513       */
514      fs_inst *inst = emit(BRW_OPCODE_CMP, *reg,
515			   fs_reg(r1_6ud),
516			   fs_reg(1u << 31));
517      inst->conditional_mod = BRW_CONDITIONAL_L;
518      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
519   }
520
521   return reg;
522}
523
524fs_inst *
525fs_visitor::emit_math(fs_opcodes opcode, fs_reg dst, fs_reg src)
526{
527   switch (opcode) {
528   case FS_OPCODE_RCP:
529   case FS_OPCODE_RSQ:
530   case FS_OPCODE_SQRT:
531   case FS_OPCODE_EXP2:
532   case FS_OPCODE_LOG2:
533   case FS_OPCODE_SIN:
534   case FS_OPCODE_COS:
535      break;
536   default:
537      assert(!"not reached: bad math opcode");
538      return NULL;
539   }
540
541   /* Can't do hstride == 0 args to gen6 math, so expand it out.  We
542    * might be able to do better by doing execsize = 1 math and then
543    * expanding that result out, but we would need to be careful with
544    * masking.
545    *
546    * The hardware ignores source modifiers (negate and abs) on math
547    * instructions, so we also move to a temp to set those up.
548    */
549   if (intel->gen >= 6 && (src.file == UNIFORM ||
550			   src.abs ||
551			   src.negate)) {
552      fs_reg expanded = fs_reg(this, glsl_type::float_type);
553      emit(BRW_OPCODE_MOV, expanded, src);
554      src = expanded;
555   }
556
557   fs_inst *inst = emit(opcode, dst, src);
558
559   if (intel->gen < 6) {
560      inst->base_mrf = 2;
561      inst->mlen = c->dispatch_width / 8;
562   }
563
564   return inst;
565}
566
567fs_inst *
568fs_visitor::emit_math(fs_opcodes opcode, fs_reg dst, fs_reg src0, fs_reg src1)
569{
570   int base_mrf = 2;
571   fs_inst *inst;
572
573   assert(opcode == FS_OPCODE_POW);
574
575   if (intel->gen >= 6) {
576      /* Can't do hstride == 0 args to gen6 math, so expand it out.
577       *
578       * The hardware ignores source modifiers (negate and abs) on math
579       * instructions, so we also move to a temp to set those up.
580       */
581      if (src0.file == UNIFORM || src0.abs || src0.negate) {
582	 fs_reg expanded = fs_reg(this, glsl_type::float_type);
583	 emit(BRW_OPCODE_MOV, expanded, src0);
584	 src0 = expanded;
585      }
586
587      if (src1.file == UNIFORM || src1.abs || src1.negate) {
588	 fs_reg expanded = fs_reg(this, glsl_type::float_type);
589	 emit(BRW_OPCODE_MOV, expanded, src1);
590	 src1 = expanded;
591      }
592
593      inst = emit(opcode, dst, src0, src1);
594   } else {
595      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1), src1);
596      inst = emit(opcode, dst, src0, reg_null_f);
597
598      inst->base_mrf = base_mrf;
599      inst->mlen = 2 * c->dispatch_width / 8;
600   }
601   return inst;
602}
603
604/**
605 * To be called after the last _mesa_add_state_reference() call, to
606 * set up prog_data.param[] for assign_curb_setup() and
607 * setup_pull_constants().
608 */
609void
610fs_visitor::setup_paramvalues_refs()
611{
612   if (c->dispatch_width != 8)
613      return;
614
615   /* Set up the pointers to ParamValues now that that array is finalized. */
616   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
617      c->prog_data.param[i] =
618	 (const float *)fp->Base.Parameters->ParameterValues[this->param_index[i]] +
619	 this->param_offset[i];
620   }
621}
622
623void
624fs_visitor::assign_curb_setup()
625{
626   c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
627   if (c->dispatch_width == 8) {
628      c->prog_data.first_curbe_grf = c->nr_payload_regs;
629   } else {
630      c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
631   }
632
633   /* Map the offsets in the UNIFORM file to fixed HW regs. */
634   foreach_list(node, &this->instructions) {
635      fs_inst *inst = (fs_inst *)node;
636
637      for (unsigned int i = 0; i < 3; i++) {
638	 if (inst->src[i].file == UNIFORM) {
639	    int constant_nr = inst->src[i].hw_reg + inst->src[i].reg_offset;
640	    struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
641						  constant_nr / 8,
642						  constant_nr % 8);
643
644	    inst->src[i].file = FIXED_HW_REG;
645	    inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
646	 }
647      }
648   }
649}
650
651void
652fs_visitor::calculate_urb_setup()
653{
654   for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
655      urb_setup[i] = -1;
656   }
657
658   int urb_next = 0;
659   /* Figure out where each of the incoming setup attributes lands. */
660   if (intel->gen >= 6) {
661      for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
662	 if (fp->Base.InputsRead & BITFIELD64_BIT(i)) {
663	    urb_setup[i] = urb_next++;
664	 }
665      }
666   } else {
667      /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
668      for (unsigned int i = 0; i < VERT_RESULT_MAX; i++) {
669	 if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
670	    int fp_index;
671
672	    if (i >= VERT_RESULT_VAR0)
673	       fp_index = i - (VERT_RESULT_VAR0 - FRAG_ATTRIB_VAR0);
674	    else if (i <= VERT_RESULT_TEX7)
675	       fp_index = i;
676	    else
677	       fp_index = -1;
678
679	    if (fp_index >= 0)
680	       urb_setup[fp_index] = urb_next++;
681	 }
682      }
683   }
684
685   /* Each attribute is 4 setup channels, each of which is half a reg. */
686   c->prog_data.urb_read_length = urb_next * 2;
687}
688
689void
690fs_visitor::assign_urb_setup()
691{
692   int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
693
694   /* Offset all the urb_setup[] index by the actual position of the
695    * setup regs, now that the location of the constants has been chosen.
696    */
697   foreach_list(node, &this->instructions) {
698      fs_inst *inst = (fs_inst *)node;
699
700      if (inst->opcode == FS_OPCODE_LINTERP) {
701	 assert(inst->src[2].file == FIXED_HW_REG);
702	 inst->src[2].fixed_hw_reg.nr += urb_start;
703      }
704
705      if (inst->opcode == FS_OPCODE_CINTERP) {
706	 assert(inst->src[0].file == FIXED_HW_REG);
707	 inst->src[0].fixed_hw_reg.nr += urb_start;
708      }
709   }
710
711   this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
712}
713
714/**
715 * Split large virtual GRFs into separate components if we can.
716 *
717 * This is mostly duplicated with what brw_fs_vector_splitting does,
718 * but that's really conservative because it's afraid of doing
719 * splitting that doesn't result in real progress after the rest of
720 * the optimization phases, which would cause infinite looping in
721 * optimization.  We can do it once here, safely.  This also has the
722 * opportunity to split interpolated values, or maybe even uniforms,
723 * which we don't have at the IR level.
724 *
725 * We want to split, because virtual GRFs are what we register
726 * allocate and spill (due to contiguousness requirements for some
727 * instructions), and they're what we naturally generate in the
728 * codegen process, but most virtual GRFs don't actually need to be
729 * contiguous sets of GRFs.  If we split, we'll end up with reduced
730 * live intervals and better dead code elimination and coalescing.
731 */
732void
733fs_visitor::split_virtual_grfs()
734{
735   int num_vars = this->virtual_grf_next;
736   bool split_grf[num_vars];
737   int new_virtual_grf[num_vars];
738
739   /* Try to split anything > 0 sized. */
740   for (int i = 0; i < num_vars; i++) {
741      if (this->virtual_grf_sizes[i] != 1)
742	 split_grf[i] = true;
743      else
744	 split_grf[i] = false;
745   }
746
747   if (brw->has_pln) {
748      /* PLN opcodes rely on the delta_xy being contiguous. */
749      split_grf[this->delta_x.reg] = false;
750   }
751
752   foreach_list(node, &this->instructions) {
753      fs_inst *inst = (fs_inst *)node;
754
755      /* Texturing produces 4 contiguous registers, so no splitting. */
756      if (inst->is_tex()) {
757	 split_grf[inst->dst.reg] = false;
758      }
759   }
760
761   /* Allocate new space for split regs.  Note that the virtual
762    * numbers will be contiguous.
763    */
764   for (int i = 0; i < num_vars; i++) {
765      if (split_grf[i]) {
766	 new_virtual_grf[i] = virtual_grf_alloc(1);
767	 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
768	    int reg = virtual_grf_alloc(1);
769	    assert(reg == new_virtual_grf[i] + j - 1);
770	    (void) reg;
771	 }
772	 this->virtual_grf_sizes[i] = 1;
773      }
774   }
775
776   foreach_list(node, &this->instructions) {
777      fs_inst *inst = (fs_inst *)node;
778
779      if (inst->dst.file == GRF &&
780	  split_grf[inst->dst.reg] &&
781	  inst->dst.reg_offset != 0) {
782	 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
783			  inst->dst.reg_offset - 1);
784	 inst->dst.reg_offset = 0;
785      }
786      for (int i = 0; i < 3; i++) {
787	 if (inst->src[i].file == GRF &&
788	     split_grf[inst->src[i].reg] &&
789	     inst->src[i].reg_offset != 0) {
790	    inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
791				inst->src[i].reg_offset - 1);
792	    inst->src[i].reg_offset = 0;
793	 }
794      }
795   }
796   this->live_intervals_valid = false;
797}
798
799bool
800fs_visitor::remove_dead_constants()
801{
802   if (c->dispatch_width == 8) {
803      this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
804
805      for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
806	 this->params_remap[i] = -1;
807
808      /* Find which params are still in use. */
809      foreach_list(node, &this->instructions) {
810	 fs_inst *inst = (fs_inst *)node;
811
812	 for (int i = 0; i < 3; i++) {
813	    int constant_nr = inst->src[i].hw_reg + inst->src[i].reg_offset;
814
815	    if (inst->src[i].file != UNIFORM)
816	       continue;
817
818	    assert(constant_nr < (int)c->prog_data.nr_params);
819
820	    /* For now, set this to non-negative.  We'll give it the
821	     * actual new number in a moment, in order to keep the
822	     * register numbers nicely ordered.
823	     */
824	    this->params_remap[constant_nr] = 0;
825	 }
826      }
827
828      /* Figure out what the new numbers for the params will be.  At some
829       * point when we're doing uniform array access, we're going to want
830       * to keep the distinction between .reg and .reg_offset, but for
831       * now we don't care.
832       */
833      unsigned int new_nr_params = 0;
834      for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
835	 if (this->params_remap[i] != -1) {
836	    this->params_remap[i] = new_nr_params++;
837	 }
838      }
839
840      /* Update the list of params to be uploaded to match our new numbering. */
841      for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
842	 int remapped = this->params_remap[i];
843
844	 if (remapped == -1)
845	    continue;
846
847	 /* We've already done setup_paramvalues_refs() so no need to worry
848	  * about param_index and param_offset.
849	  */
850	 c->prog_data.param[remapped] = c->prog_data.param[i];
851	 c->prog_data.param_convert[remapped] = c->prog_data.param_convert[i];
852      }
853
854      c->prog_data.nr_params = new_nr_params;
855   } else {
856      /* This should have been generated in the 8-wide pass already. */
857      assert(this->params_remap);
858   }
859
860   /* Now do the renumbering of the shader to remove unused params. */
861   foreach_list(node, &this->instructions) {
862      fs_inst *inst = (fs_inst *)node;
863
864      for (int i = 0; i < 3; i++) {
865	 int constant_nr = inst->src[i].hw_reg + inst->src[i].reg_offset;
866
867	 if (inst->src[i].file != UNIFORM)
868	    continue;
869
870	 assert(this->params_remap[constant_nr] != -1);
871	 inst->src[i].hw_reg = this->params_remap[constant_nr];
872	 inst->src[i].reg_offset = 0;
873      }
874   }
875
876   return true;
877}
878
879/**
880 * Choose accesses from the UNIFORM file to demote to using the pull
881 * constant buffer.
882 *
883 * We allow a fragment shader to have more than the specified minimum
884 * maximum number of fragment shader uniform components (64).  If
885 * there are too many of these, they'd fill up all of register space.
886 * So, this will push some of them out to the pull constant buffer and
887 * update the program to load them.
888 */
889void
890fs_visitor::setup_pull_constants()
891{
892   /* Only allow 16 registers (128 uniform components) as push constants. */
893   unsigned int max_uniform_components = 16 * 8;
894   if (c->prog_data.nr_params <= max_uniform_components)
895      return;
896
897   if (c->dispatch_width == 16) {
898      fail("Pull constants not supported in 16-wide\n");
899      return;
900   }
901
902   /* Just demote the end of the list.  We could probably do better
903    * here, demoting things that are rarely used in the program first.
904    */
905   int pull_uniform_base = max_uniform_components;
906   int pull_uniform_count = c->prog_data.nr_params - pull_uniform_base;
907
908   foreach_list(node, &this->instructions) {
909      fs_inst *inst = (fs_inst *)node;
910
911      for (int i = 0; i < 3; i++) {
912	 if (inst->src[i].file != UNIFORM)
913	    continue;
914
915	 int uniform_nr = inst->src[i].hw_reg + inst->src[i].reg_offset;
916	 if (uniform_nr < pull_uniform_base)
917	    continue;
918
919	 fs_reg dst = fs_reg(this, glsl_type::float_type);
920	 fs_inst *pull = new(mem_ctx) fs_inst(FS_OPCODE_PULL_CONSTANT_LOAD,
921					      dst);
922	 pull->offset = ((uniform_nr - pull_uniform_base) * 4) & ~15;
923	 pull->ir = inst->ir;
924	 pull->annotation = inst->annotation;
925	 pull->base_mrf = 14;
926	 pull->mlen = 1;
927
928	 inst->insert_before(pull);
929
930	 inst->src[i].file = GRF;
931	 inst->src[i].reg = dst.reg;
932	 inst->src[i].reg_offset = 0;
933	 inst->src[i].smear = (uniform_nr - pull_uniform_base) & 3;
934      }
935   }
936
937   for (int i = 0; i < pull_uniform_count; i++) {
938      c->prog_data.pull_param[i] = c->prog_data.param[pull_uniform_base + i];
939      c->prog_data.pull_param_convert[i] =
940	 c->prog_data.param_convert[pull_uniform_base + i];
941   }
942   c->prog_data.nr_params -= pull_uniform_count;
943   c->prog_data.nr_pull_params = pull_uniform_count;
944}
945
946void
947fs_visitor::calculate_live_intervals()
948{
949   int num_vars = this->virtual_grf_next;
950   int *def = ralloc_array(mem_ctx, int, num_vars);
951   int *use = ralloc_array(mem_ctx, int, num_vars);
952   int loop_depth = 0;
953   int loop_start = 0;
954
955   if (this->live_intervals_valid)
956      return;
957
958   for (int i = 0; i < num_vars; i++) {
959      def[i] = MAX_INSTRUCTION;
960      use[i] = -1;
961   }
962
963   int ip = 0;
964   foreach_list(node, &this->instructions) {
965      fs_inst *inst = (fs_inst *)node;
966
967      if (inst->opcode == BRW_OPCODE_DO) {
968	 if (loop_depth++ == 0)
969	    loop_start = ip;
970      } else if (inst->opcode == BRW_OPCODE_WHILE) {
971	 loop_depth--;
972
973	 if (loop_depth == 0) {
974	    /* Patches up the use of vars marked for being live across
975	     * the whole loop.
976	     */
977	    for (int i = 0; i < num_vars; i++) {
978	       if (use[i] == loop_start) {
979		  use[i] = ip;
980	       }
981	    }
982	 }
983      } else {
984	 for (unsigned int i = 0; i < 3; i++) {
985	    if (inst->src[i].file == GRF) {
986	       int reg = inst->src[i].reg;
987
988	       if (!loop_depth) {
989		  use[reg] = ip;
990	       } else {
991		  def[reg] = MIN2(loop_start, def[reg]);
992		  use[reg] = loop_start;
993
994		  /* Nobody else is going to go smash our start to
995		   * later in the loop now, because def[reg] now
996		   * points before the bb header.
997		   */
998	       }
999	    }
1000	 }
1001	 if (inst->dst.file == GRF) {
1002	    int reg = inst->dst.reg;
1003
1004	    if (!loop_depth) {
1005	       def[reg] = MIN2(def[reg], ip);
1006	    } else {
1007	       def[reg] = MIN2(def[reg], loop_start);
1008	    }
1009	 }
1010      }
1011
1012      ip++;
1013   }
1014
1015   ralloc_free(this->virtual_grf_def);
1016   ralloc_free(this->virtual_grf_use);
1017   this->virtual_grf_def = def;
1018   this->virtual_grf_use = use;
1019
1020   this->live_intervals_valid = true;
1021}
1022
1023/**
1024 * Attempts to move immediate constants into the immediate
1025 * constant slot of following instructions.
1026 *
 * Immediate constants are a bit tricky -- they have to be in the last
 * operand slot, and you can't do abs/negate on them.
 */
1030
1031bool
1032fs_visitor::propagate_constants()
1033{
1034   bool progress = false;
1035
1036   calculate_live_intervals();
1037
1038   foreach_list(node, &this->instructions) {
1039      fs_inst *inst = (fs_inst *)node;
1040
1041      if (inst->opcode != BRW_OPCODE_MOV ||
1042	  inst->predicated ||
1043	  inst->dst.file != GRF || inst->src[0].file != IMM ||
1044	  inst->dst.type != inst->src[0].type ||
1045	  (c->dispatch_width == 16 &&
1046	   (inst->force_uncompressed || inst->force_sechalf)))
1047	 continue;
1048
1049      /* Don't bother with cases where we should have had the
1050       * operation on the constant folded in GLSL already.
1051       */
1052      if (inst->saturate)
1053	 continue;
1054
1055      /* Found a move of a constant to a GRF.  Find anything else using the GRF
1056       * before it's written, and replace it with the constant if we can.
1057       */
1058      for (fs_inst *scan_inst = (fs_inst *)inst->next;
1059	   !scan_inst->is_tail_sentinel();
1060	   scan_inst = (fs_inst *)scan_inst->next) {
1061	 if (scan_inst->opcode == BRW_OPCODE_DO ||
1062	     scan_inst->opcode == BRW_OPCODE_WHILE ||
1063	     scan_inst->opcode == BRW_OPCODE_ELSE ||
1064	     scan_inst->opcode == BRW_OPCODE_ENDIF) {
1065	    break;
1066	 }
1067
1068	 for (int i = 2; i >= 0; i--) {
1069	    if (scan_inst->src[i].file != GRF ||
1070		scan_inst->src[i].reg != inst->dst.reg ||
1071		scan_inst->src[i].reg_offset != inst->dst.reg_offset)
1072	       continue;
1073
1074	    /* Don't bother with cases where we should have had the
1075	     * operation on the constant folded in GLSL already.
1076	     */
1077	    if (scan_inst->src[i].negate || scan_inst->src[i].abs)
1078	       continue;
1079
1080	    switch (scan_inst->opcode) {
1081	    case BRW_OPCODE_MOV:
1082	       scan_inst->src[i] = inst->src[0];
1083	       progress = true;
1084	       break;
1085
1086	    case BRW_OPCODE_MUL:
1087	    case BRW_OPCODE_ADD:
1088	       if (i == 1) {
1089		  scan_inst->src[i] = inst->src[0];
1090		  progress = true;
1091	       } else if (i == 0 && scan_inst->src[1].file != IMM) {
1092		  /* Fit this constant in by commuting the operands */
1093		  scan_inst->src[0] = scan_inst->src[1];
1094		  scan_inst->src[1] = inst->src[0];
1095		  progress = true;
1096	       }
1097	       break;
1098
1099	    case BRW_OPCODE_CMP:
1100	       if (i == 1) {
1101		  scan_inst->src[i] = inst->src[0];
1102		  progress = true;
1103	       } else if (i == 0 && scan_inst->src[1].file != IMM) {
1104		  uint32_t new_cmod;
1105
1106		  new_cmod = brw_swap_cmod(scan_inst->conditional_mod);
1107		  if (new_cmod != ~0u) {
1108		     /* Fit this constant in by swapping the operands and
1109		      * flipping the test
1110		      */
1111		     scan_inst->src[0] = scan_inst->src[1];
1112		     scan_inst->src[1] = inst->src[0];
1113		     scan_inst->conditional_mod = new_cmod;
1114		     progress = true;
1115		  }
1116	       }
1117	       break;
1118
1119	    case BRW_OPCODE_SEL:
1120	       if (i == 1) {
1121		  scan_inst->src[i] = inst->src[0];
1122		  progress = true;
1123	       } else if (i == 0 && scan_inst->src[1].file != IMM) {
1124		  scan_inst->src[0] = scan_inst->src[1];
1125		  scan_inst->src[1] = inst->src[0];
1126
1127		  /* If this was predicated, flipping operands means
1128		   * we also need to flip the predicate.
1129		   */
1130		  if (scan_inst->conditional_mod == BRW_CONDITIONAL_NONE) {
1131		     scan_inst->predicate_inverse =
1132			!scan_inst->predicate_inverse;
1133		  }
1134		  progress = true;
1135	       }
1136	       break;
1137
1138	    case FS_OPCODE_RCP:
1139	       /* The hardware doesn't do math on immediate values
1140		* (because why are you doing that, seriously?), but
1141		* the correct answer is to just constant fold it
1142		* anyway.
1143		*/
1144	       assert(i == 0);
1145	       if (inst->src[0].imm.f != 0.0f) {
1146		  scan_inst->opcode = BRW_OPCODE_MOV;
1147		  scan_inst->src[0] = inst->src[0];
1148		  scan_inst->src[0].imm.f = 1.0f / scan_inst->src[0].imm.f;
1149		  progress = true;
1150	       }
1151	       break;
1152	    }
1153	 }
1154
1155	 if (scan_inst->dst.file == GRF &&
1156	     scan_inst->dst.reg == inst->dst.reg &&
1157	     (scan_inst->dst.reg_offset == inst->dst.reg_offset ||
1158	      scan_inst->is_tex())) {
1159	    break;
1160	 }
1161      }
1162   }
1163
1164   if (progress)
1165       this->live_intervals_valid = false;
1166
1167   return progress;
1168}
1169
1170
1171/**
1172 * Attempts to move immediate constants into the immediate
1173 * constant slot of following instructions.
1174 *
1175 * Immediate constants are a bit tricky -- they have to be in the last
1176 * operand slot, you can't do abs/negate on them,
1177 */
1178
1179bool
1180fs_visitor::opt_algebraic()
1181{
1182   bool progress = false;
1183
1184   calculate_live_intervals();
1185
1186   foreach_list(node, &this->instructions) {
1187      fs_inst *inst = (fs_inst *)node;
1188
1189      switch (inst->opcode) {
1190      case BRW_OPCODE_MUL:
1191	 if (inst->src[1].file != IMM)
1192	    continue;
1193
1194	 /* a * 1.0 = a */
1195	 if (inst->src[1].type == BRW_REGISTER_TYPE_F &&
1196	     inst->src[1].imm.f == 1.0) {
1197	    inst->opcode = BRW_OPCODE_MOV;
1198	    inst->src[1] = reg_undef;
1199	    progress = true;
1200	    break;
1201	 }
1202
1203	 break;
1204      }
1205   }
1206
1207   return progress;
1208}
1209
1210/**
1211 * Must be called after calculate_live_intervales() to remove unused
1212 * writes to registers -- register allocation will fail otherwise
1213 * because something deffed but not used won't be considered to
1214 * interfere with other regs.
1215 */
1216bool
1217fs_visitor::dead_code_eliminate()
1218{
1219   bool progress = false;
1220   int pc = 0;
1221
1222   calculate_live_intervals();
1223
1224   foreach_list_safe(node, &this->instructions) {
1225      fs_inst *inst = (fs_inst *)node;
1226
1227      if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
1228	 inst->remove();
1229	 progress = true;
1230      }
1231
1232      pc++;
1233   }
1234
1235   if (progress)
1236      live_intervals_valid = false;
1237
1238   return progress;
1239}
1240
1241bool
1242fs_visitor::register_coalesce()
1243{
1244   bool progress = false;
1245   int if_depth = 0;
1246   int loop_depth = 0;
1247
1248   foreach_list_safe(node, &this->instructions) {
1249      fs_inst *inst = (fs_inst *)node;
1250
1251      /* Make sure that we dominate the instructions we're going to
1252       * scan for interfering with our coalescing, or we won't have
1253       * scanned enough to see if anything interferes with our
1254       * coalescing.  We don't dominate the following instructions if
1255       * we're in a loop or an if block.
1256       */
1257      switch (inst->opcode) {
1258      case BRW_OPCODE_DO:
1259	 loop_depth++;
1260	 break;
1261      case BRW_OPCODE_WHILE:
1262	 loop_depth--;
1263	 break;
1264      case BRW_OPCODE_IF:
1265	 if_depth++;
1266	 break;
1267      case BRW_OPCODE_ENDIF:
1268	 if_depth--;
1269	 break;
1270      }
1271      if (loop_depth || if_depth)
1272	 continue;
1273
1274      if (inst->opcode != BRW_OPCODE_MOV ||
1275	  inst->predicated ||
1276	  inst->saturate ||
1277	  inst->dst.file != GRF || (inst->src[0].file != GRF &&
1278				    inst->src[0].file != UNIFORM)||
1279	  inst->dst.type != inst->src[0].type)
1280	 continue;
1281
1282      bool has_source_modifiers = inst->src[0].abs || inst->src[0].negate;
1283
1284      /* Found a move of a GRF to a GRF.  Let's see if we can coalesce
1285       * them: check for no writes to either one until the exit of the
1286       * program.
1287       */
1288      bool interfered = false;
1289
1290      for (fs_inst *scan_inst = (fs_inst *)inst->next;
1291	   !scan_inst->is_tail_sentinel();
1292	   scan_inst = (fs_inst *)scan_inst->next) {
1293	 if (scan_inst->dst.file == GRF) {
1294	    if (scan_inst->dst.reg == inst->dst.reg &&
1295		(scan_inst->dst.reg_offset == inst->dst.reg_offset ||
1296		 scan_inst->is_tex())) {
1297	       interfered = true;
1298	       break;
1299	    }
1300	    if (inst->src[0].file == GRF &&
1301		scan_inst->dst.reg == inst->src[0].reg &&
1302		(scan_inst->dst.reg_offset == inst->src[0].reg_offset ||
1303		 scan_inst->is_tex())) {
1304	       interfered = true;
1305	       break;
1306	    }
1307	 }
1308
1309	 /* The gen6 MATH instruction can't handle source modifiers or
1310	  * unusual register regions, so avoid coalescing those for
1311	  * now.  We should do something more specific.
1312	  */
1313	 if (intel->gen >= 6 &&
1314	     scan_inst->is_math() &&
1315	     (has_source_modifiers || inst->src[0].file == UNIFORM)) {
1316	    interfered = true;
1317	    break;
1318	 }
1319      }
1320      if (interfered) {
1321	 continue;
1322      }
1323
1324      /* Rewrite the later usage to point at the source of the move to
1325       * be removed.
1326       */
1327      for (fs_inst *scan_inst = inst;
1328	   !scan_inst->is_tail_sentinel();
1329	   scan_inst = (fs_inst *)scan_inst->next) {
1330	 for (int i = 0; i < 3; i++) {
1331	    if (scan_inst->src[i].file == GRF &&
1332		scan_inst->src[i].reg == inst->dst.reg &&
1333		scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
1334	       fs_reg new_src = inst->src[0];
1335	       new_src.negate ^= scan_inst->src[i].negate;
1336	       new_src.abs |= scan_inst->src[i].abs;
1337	       scan_inst->src[i] = new_src;
1338	    }
1339	 }
1340      }
1341
1342      inst->remove();
1343      progress = true;
1344   }
1345
1346   if (progress)
1347      live_intervals_valid = false;
1348
1349   return progress;
1350}
1351
1352
1353bool
1354fs_visitor::compute_to_mrf()
1355{
1356   bool progress = false;
1357   int next_ip = 0;
1358
1359   calculate_live_intervals();
1360
1361   foreach_list_safe(node, &this->instructions) {
1362      fs_inst *inst = (fs_inst *)node;
1363
1364      int ip = next_ip;
1365      next_ip++;
1366
1367      if (inst->opcode != BRW_OPCODE_MOV ||
1368	  inst->predicated ||
1369	  inst->dst.file != MRF || inst->src[0].file != GRF ||
1370	  inst->dst.type != inst->src[0].type ||
1371	  inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
1372	 continue;
1373
1374      /* Work out which hardware MRF registers are written by this
1375       * instruction.
1376       */
1377      int mrf_low = inst->dst.hw_reg & ~BRW_MRF_COMPR4;
1378      int mrf_high;
1379      if (inst->dst.hw_reg & BRW_MRF_COMPR4) {
1380	 mrf_high = mrf_low + 4;
1381      } else if (c->dispatch_width == 16 &&
1382		 (!inst->force_uncompressed && !inst->force_sechalf)) {
1383	 mrf_high = mrf_low + 1;
1384      } else {
1385	 mrf_high = mrf_low;
1386      }
1387
1388      /* Can't compute-to-MRF this GRF if someone else was going to
1389       * read it later.
1390       */
1391      if (this->virtual_grf_use[inst->src[0].reg] > ip)
1392	 continue;
1393
1394      /* Found a move of a GRF to a MRF.  Let's see if we can go
1395       * rewrite the thing that made this GRF to write into the MRF.
1396       */
1397      fs_inst *scan_inst;
1398      for (scan_inst = (fs_inst *)inst->prev;
1399	   scan_inst->prev != NULL;
1400	   scan_inst = (fs_inst *)scan_inst->prev) {
1401	 if (scan_inst->dst.file == GRF &&
1402	     scan_inst->dst.reg == inst->src[0].reg) {
1403	    /* Found the last thing to write our reg we want to turn
1404	     * into a compute-to-MRF.
1405	     */
1406
1407	    if (scan_inst->is_tex()) {
1408	       /* texturing writes several continuous regs, so we can't
1409		* compute-to-mrf that.
1410		*/
1411	       break;
1412	    }
1413
1414	    /* If it's predicated, it (probably) didn't populate all
1415	     * the channels.  We might be able to rewrite everything
1416	     * that writes that reg, but it would require smarter
1417	     * tracking to delay the rewriting until complete success.
1418	     */
1419	    if (scan_inst->predicated)
1420	       break;
1421
1422	    /* If it's half of register setup and not the same half as
1423	     * our MOV we're trying to remove, bail for now.
1424	     */
1425	    if (scan_inst->force_uncompressed != inst->force_uncompressed ||
1426		scan_inst->force_sechalf != inst->force_sechalf) {
1427	       break;
1428	    }
1429
1430	    /* SEND instructions can't have MRF as a destination. */
1431	    if (scan_inst->mlen)
1432	       break;
1433
1434	    if (intel->gen >= 6) {
1435	       /* gen6 math instructions must have the destination be
1436		* GRF, so no compute-to-MRF for them.
1437		*/
1438	       if (scan_inst->is_math()) {
1439		  break;
1440	       }
1441	    }
1442
1443	    if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
1444	       /* Found the creator of our MRF's source value. */
1445	       scan_inst->dst.file = MRF;
1446	       scan_inst->dst.hw_reg = inst->dst.hw_reg;
1447	       scan_inst->saturate |= inst->saturate;
1448	       inst->remove();
1449	       progress = true;
1450	    }
1451	    break;
1452	 }
1453
1454	 /* We don't handle flow control here.  Most computation of
1455	  * values that end up in MRFs are shortly before the MRF
1456	  * write anyway.
1457	  */
1458	 if (scan_inst->opcode == BRW_OPCODE_DO ||
1459	     scan_inst->opcode == BRW_OPCODE_WHILE ||
1460	     scan_inst->opcode == BRW_OPCODE_ELSE ||
1461	     scan_inst->opcode == BRW_OPCODE_ENDIF) {
1462	    break;
1463	 }
1464
1465	 /* You can't read from an MRF, so if someone else reads our
1466	  * MRF's source GRF that we wanted to rewrite, that stops us.
1467	  */
1468	 bool interfered = false;
1469	 for (int i = 0; i < 3; i++) {
1470	    if (scan_inst->src[i].file == GRF &&
1471		scan_inst->src[i].reg == inst->src[0].reg &&
1472		scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
1473	       interfered = true;
1474	    }
1475	 }
1476	 if (interfered)
1477	    break;
1478
1479	 if (scan_inst->dst.file == MRF) {
1480	    /* If somebody else writes our MRF here, we can't
1481	     * compute-to-MRF before that.
1482	     */
1483	    int scan_mrf_low = scan_inst->dst.hw_reg & ~BRW_MRF_COMPR4;
1484	    int scan_mrf_high;
1485
1486	    if (scan_inst->dst.hw_reg & BRW_MRF_COMPR4) {
1487	       scan_mrf_high = scan_mrf_low + 4;
1488	    } else if (c->dispatch_width == 16 &&
1489		       (!scan_inst->force_uncompressed &&
1490			!scan_inst->force_sechalf)) {
1491	       scan_mrf_high = scan_mrf_low + 1;
1492	    } else {
1493	       scan_mrf_high = scan_mrf_low;
1494	    }
1495
1496	    if (mrf_low == scan_mrf_low ||
1497		mrf_low == scan_mrf_high ||
1498		mrf_high == scan_mrf_low ||
1499		mrf_high == scan_mrf_high) {
1500	       break;
1501	    }
1502	 }
1503
1504	 if (scan_inst->mlen > 0) {
1505	    /* Found a SEND instruction, which means that there are
1506	     * live values in MRFs from base_mrf to base_mrf +
1507	     * scan_inst->mlen - 1.  Don't go pushing our MRF write up
1508	     * above it.
1509	     */
1510	    if (mrf_low >= scan_inst->base_mrf &&
1511		mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
1512	       break;
1513	    }
1514	    if (mrf_high >= scan_inst->base_mrf &&
1515		mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
1516	       break;
1517	    }
1518	 }
1519      }
1520   }
1521
1522   return progress;
1523}
1524
1525/**
1526 * Walks through basic blocks, locking for repeated MRF writes and
1527 * removing the later ones.
1528 */
1529bool
1530fs_visitor::remove_duplicate_mrf_writes()
1531{
1532   fs_inst *last_mrf_move[16];
1533   bool progress = false;
1534
1535   /* Need to update the MRF tracking for compressed instructions. */
1536   if (c->dispatch_width == 16)
1537      return false;
1538
1539   memset(last_mrf_move, 0, sizeof(last_mrf_move));
1540
1541   foreach_list_safe(node, &this->instructions) {
1542      fs_inst *inst = (fs_inst *)node;
1543
1544      switch (inst->opcode) {
1545      case BRW_OPCODE_DO:
1546      case BRW_OPCODE_WHILE:
1547      case BRW_OPCODE_IF:
1548      case BRW_OPCODE_ELSE:
1549      case BRW_OPCODE_ENDIF:
1550	 memset(last_mrf_move, 0, sizeof(last_mrf_move));
1551	 continue;
1552      default:
1553	 break;
1554      }
1555
1556      if (inst->opcode == BRW_OPCODE_MOV &&
1557	  inst->dst.file == MRF) {
1558	 fs_inst *prev_inst = last_mrf_move[inst->dst.hw_reg];
1559	 if (prev_inst && inst->equals(prev_inst)) {
1560	    inst->remove();
1561	    progress = true;
1562	    continue;
1563	 }
1564      }
1565
1566      /* Clear out the last-write records for MRFs that were overwritten. */
1567      if (inst->dst.file == MRF) {
1568	 last_mrf_move[inst->dst.hw_reg] = NULL;
1569      }
1570
1571      if (inst->mlen > 0) {
1572	 /* Found a SEND instruction, which will include two or fewer
1573	  * implied MRF writes.  We could do better here.
1574	  */
1575	 for (int i = 0; i < implied_mrf_writes(inst); i++) {
1576	    last_mrf_move[inst->base_mrf + i] = NULL;
1577	 }
1578      }
1579
1580      /* Clear out any MRF move records whose sources got overwritten. */
1581      if (inst->dst.file == GRF) {
1582	 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
1583	    if (last_mrf_move[i] &&
1584		last_mrf_move[i]->src[0].reg == inst->dst.reg) {
1585	       last_mrf_move[i] = NULL;
1586	    }
1587	 }
1588      }
1589
1590      if (inst->opcode == BRW_OPCODE_MOV &&
1591	  inst->dst.file == MRF &&
1592	  inst->src[0].file == GRF &&
1593	  !inst->predicated) {
1594	 last_mrf_move[inst->dst.hw_reg] = inst;
1595      }
1596   }
1597
1598   return progress;
1599}
1600
1601bool
1602fs_visitor::virtual_grf_interferes(int a, int b)
1603{
1604   int start = MAX2(this->virtual_grf_def[a], this->virtual_grf_def[b]);
1605   int end = MIN2(this->virtual_grf_use[a], this->virtual_grf_use[b]);
1606
1607   /* We can't handle dead register writes here, without iterating
1608    * over the whole instruction stream to find every single dead
1609    * write to that register to compare to the live interval of the
1610    * other register.  Just assert that dead_code_eliminate() has been
1611    * called.
1612    */
1613   assert((this->virtual_grf_use[a] != -1 ||
1614	   this->virtual_grf_def[a] == MAX_INSTRUCTION) &&
1615	  (this->virtual_grf_use[b] != -1 ||
1616	   this->virtual_grf_def[b] == MAX_INSTRUCTION));
1617
1618   /* If the register is used to store 16 values of less than float
1619    * size (only the case for pixel_[xy]), then we can't allocate
1620    * another dword-sized thing to that register that would be used in
1621    * the same instruction.  This is because when the GPU decodes (for
1622    * example):
1623    *
1624    * (declare (in ) vec4 gl_FragCoord@0x97766a0)
1625    * add(16)         g6<1>F          g6<8,8,1>UW     0.5F { align1 compr };
1626    *
1627    * it's actually processed as:
1628    * add(8)         g6<1>F          g6<8,8,1>UW     0.5F { align1 };
1629    * add(8)         g7<1>F          g6.8<8,8,1>UW   0.5F { align1 sechalf };
1630    *
1631    * so our second half values in g6 got overwritten in the first
1632    * half.
1633    */
1634   if (c->dispatch_width == 16 && (this->pixel_x.reg == a ||
1635				   this->pixel_x.reg == b ||
1636				   this->pixel_y.reg == a ||
1637				   this->pixel_y.reg == b)) {
1638      return start <= end;
1639   }
1640
1641   return start < end;
1642}
1643
1644bool
1645fs_visitor::run()
1646{
1647   uint32_t prog_offset_16 = 0;
1648   uint32_t orig_nr_params = c->prog_data.nr_params;
1649
1650   brw_wm_payload_setup(brw, c);
1651
1652   if (c->dispatch_width == 16) {
1653      /* align to 64 byte boundary. */
1654      while ((c->func.nr_insn * sizeof(struct brw_instruction)) % 64) {
1655	 brw_NOP(p);
1656      }
1657
1658      /* Save off the start of this 16-wide program in case we succeed. */
1659      prog_offset_16 = c->func.nr_insn * sizeof(struct brw_instruction);
1660
1661      brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
1662   }
1663
1664   if (0) {
1665      emit_dummy_fs();
1666   } else {
1667      calculate_urb_setup();
1668      if (intel->gen < 6)
1669	 emit_interpolation_setup_gen4();
1670      else
1671	 emit_interpolation_setup_gen6();
1672
1673      /* Generate FS IR for main().  (the visitor only descends into
1674       * functions called "main").
1675       */
1676      foreach_list(node, &*shader->ir) {
1677	 ir_instruction *ir = (ir_instruction *)node;
1678	 base_ir = ir;
1679	 this->result = reg_undef;
1680	 ir->accept(this);
1681      }
1682      if (failed)
1683	 return false;
1684
1685      emit_fb_writes();
1686
1687      split_virtual_grfs();
1688
1689      setup_paramvalues_refs();
1690      setup_pull_constants();
1691
1692      bool progress;
1693      do {
1694	 progress = false;
1695
1696	 progress = remove_duplicate_mrf_writes() || progress;
1697
1698	 progress = propagate_constants() || progress;
1699	 progress = opt_algebraic() || progress;
1700	 progress = register_coalesce() || progress;
1701	 progress = compute_to_mrf() || progress;
1702	 progress = dead_code_eliminate() || progress;
1703      } while (progress);
1704
1705      remove_dead_constants();
1706
1707      schedule_instructions();
1708
1709      assign_curb_setup();
1710      assign_urb_setup();
1711
1712      if (0) {
1713	 /* Debug of register spilling: Go spill everything. */
1714	 int virtual_grf_count = virtual_grf_next;
1715	 for (int i = 0; i < virtual_grf_count; i++) {
1716	    spill_reg(i);
1717	 }
1718      }
1719
1720      if (0)
1721	 assign_regs_trivial();
1722      else {
1723	 while (!assign_regs()) {
1724	    if (failed)
1725	       break;
1726	 }
1727      }
1728   }
1729   assert(force_uncompressed_stack == 0);
1730   assert(force_sechalf_stack == 0);
1731
1732   if (failed)
1733      return false;
1734
1735   generate_code();
1736
1737   if (c->dispatch_width == 8) {
1738      c->prog_data.reg_blocks = brw_register_blocks(grf_used);
1739   } else {
1740      c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
1741      c->prog_data.prog_offset_16 = prog_offset_16;
1742
1743      /* Make sure we didn't try to sneak in an extra uniform */
1744      assert(orig_nr_params == c->prog_data.nr_params);
1745   }
1746
1747   return !failed;
1748}
1749
1750bool
1751brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
1752	       struct gl_shader_program *prog)
1753{
1754   struct intel_context *intel = &brw->intel;
1755
1756   if (!prog)
1757      return false;
1758
1759   struct brw_shader *shader =
1760     (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
1761   if (!shader)
1762      return false;
1763
1764   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
1765      printf("GLSL IR for native fragment shader %d:\n", prog->Name);
1766      _mesa_print_ir(shader->ir, NULL);
1767      printf("\n\n");
1768   }
1769
1770   /* Now the main event: Visit the shader IR and generate our FS IR for it.
1771    */
1772   c->dispatch_width = 8;
1773
1774   fs_visitor v(c, prog, shader);
1775   if (!v.run()) {
1776      prog->LinkStatus = GL_FALSE;
1777      prog->InfoLog = ralloc_strdup(prog, v.fail_msg);
1778
1779      return false;
1780   }
1781
1782   if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0) {
1783      c->dispatch_width = 16;
1784      fs_visitor v2(c, prog, shader);
1785      v2.import_uniforms(&v);
1786      v2.run();
1787   }
1788
1789   c->prog_data.dispatch_width = 8;
1790
1791   return true;
1792}
1793
1794bool
1795brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
1796{
1797   struct brw_context *brw = brw_context(ctx);
1798   struct brw_wm_prog_key key;
1799   struct gl_fragment_program *fp = prog->FragmentProgram;
1800   struct brw_fragment_program *bfp = brw_fragment_program(fp);
1801
1802   if (!fp)
1803      return true;
1804
1805   memset(&key, 0, sizeof(key));
1806
1807   if (fp->UsesKill)
1808      key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
1809
1810   if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
1811      key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
1812
1813   /* Just assume depth testing. */
1814   key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
1815   key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
1816
1817   key.vp_outputs_written |= BITFIELD64_BIT(FRAG_ATTRIB_WPOS);
1818   for (int i = 0; i < FRAG_ATTRIB_MAX; i++) {
1819      int vp_index = -1;
1820
1821      if (!(fp->Base.InputsRead & BITFIELD64_BIT(i)))
1822	 continue;
1823
1824      key.proj_attrib_mask |= 1 << i;
1825
1826      if (i <= FRAG_ATTRIB_TEX7)
1827	 vp_index = i;
1828      else if (i >= FRAG_ATTRIB_VAR0)
1829	 vp_index = i - FRAG_ATTRIB_VAR0 + VERT_RESULT_VAR0;
1830
1831      if (vp_index >= 0)
1832	 key.vp_outputs_written |= BITFIELD64_BIT(vp_index);
1833   }
1834
1835   key.clamp_fragment_color = true;
1836
1837   for (int i = 0; i < BRW_MAX_TEX_UNIT; i++) {
1838      if (fp->Base.ShadowSamplers & (1 << i))
1839	 key.compare_funcs[i] = GL_LESS;
1840
1841      /* FINISHME: depth compares might use (0,0,0,W) for example */
1842      key.tex_swizzles[i] = SWIZZLE_XYZW;
1843   }
1844
1845   if (fp->Base.InputsRead & FRAG_BIT_WPOS) {
1846      key.drawable_height = ctx->DrawBuffer->Height;
1847      key.render_to_fbo = ctx->DrawBuffer->Name != 0;
1848   }
1849
1850   key.nr_color_regions = 1;
1851
1852   key.program_string_id = bfp->id;
1853
1854   uint32_t old_prog_offset = brw->wm.prog_offset;
1855   struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
1856
1857   bool success = do_wm_prog(brw, prog, bfp, &key);
1858
1859   brw->wm.prog_offset = old_prog_offset;
1860   brw->wm.prog_data = old_prog_data;
1861
1862   return success;
1863}
1864