brw_fs.cpp revision cf0e7aa9f8bc9c175ebd9b2ab3a8bfec4afc5abf
/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file brw_fs.cpp
 *
 * This file drives the GLSL IR -> LIR translation, contains the
 * optimizations on the LIR, and drives the generation of native code
 * from the LIR.
 */

extern "C" {

#include <sys/types.h>

#include "main/macros.h"
#include "main/shaderobj.h"
#include "main/uniforms.h"
#include "main/fbobject.h"
#include "program/prog_parameter.h"
#include "program/prog_print.h"
#include "program/register_allocate.h"
#include "program/sampler.h"
#include "program/hash_table.h"
#include "brw_context.h"
#include "brw_eu.h"
#include "brw_wm.h"
}
#include "brw_shader.h"
#include "brw_fs.h"
#include "glsl/glsl_types.h"
#include "glsl/ir_print_visitor.h"

int
fs_visitor::type_size(const struct glsl_type *type)
{
   unsigned int size, i;

   switch (type->base_type) {
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_BOOL:
      return type->components();
   case GLSL_TYPE_ARRAY:
      return type_size(type->fields.array) * type->length;
   case GLSL_TYPE_STRUCT:
      size = 0;
      for (i = 0; i < type->length; i++) {
	 size += type_size(type->fields.structure[i].type);
      }
      return size;
   case GLSL_TYPE_SAMPLER:
      /* Samplers take up no register space, since they're baked in at
       * link time.
       */
      return 0;
   default:
      assert(!"not reached");
      return 0;
   }
}

void
fs_visitor::fail(const char *format, ...)
{
   va_list va;
   char *msg;

   if (failed)
      return;

   failed = true;

   va_start(va, format);
   msg = ralloc_vasprintf(mem_ctx, format, va);
   va_end(va);
   msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);

   this->fail_msg = msg;

   if (INTEL_DEBUG & DEBUG_WM) {
      fprintf(stderr, "%s",  msg);
   }
}

void
fs_visitor::push_force_uncompressed()
{
   force_uncompressed_stack++;
}

void
fs_visitor::pop_force_uncompressed()
{
   force_uncompressed_stack--;
   assert(force_uncompressed_stack >= 0);
}

void
fs_visitor::push_force_sechalf()
{
   force_sechalf_stack++;
}

void
fs_visitor::pop_force_sechalf()
{
   force_sechalf_stack--;
   assert(force_sechalf_stack >= 0);
}

/**
 * Returns how many MRFs an FS opcode will write over.
 *
 * Note that this is not the 0 or 1 implied writes in an actual gen
 * instruction -- the FS opcodes often generate MOVs in addition.
 */
int
fs_visitor::implied_mrf_writes(fs_inst *inst)
{
   if (inst->mlen == 0)
      return 0;

   switch (inst->opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      return 1 * c->dispatch_width / 8;
   case SHADER_OPCODE_POW:
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      return 2 * c->dispatch_width / 8;
   case SHADER_OPCODE_TEX:
   case FS_OPCODE_TXB:
   case SHADER_OPCODE_TXD:
   case SHADER_OPCODE_TXF:
   case SHADER_OPCODE_TXL:
   case SHADER_OPCODE_TXS:
      return 1;
   case FS_OPCODE_FB_WRITE:
      return 2;
   case FS_OPCODE_PULL_CONSTANT_LOAD:
   case FS_OPCODE_UNSPILL:
      return 1;
   case FS_OPCODE_SPILL:
      return 2;
   default:
      assert(!"not reached");
      return inst->mlen;
   }
}

int
fs_visitor::virtual_grf_alloc(int size)
{
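   /* Grow the size array exponentially as new virtual GRFs are allocated. */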
   if (virtual_grf_array_size <= virtual_grf_next) {
      if (virtual_grf_array_size == 0)
	 virtual_grf_array_size = 16;
      else
	 virtual_grf_array_size *= 2;
      virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
				   virtual_grf_array_size);
   }
   virtual_grf_sizes[virtual_grf_next] = size;
   return virtual_grf_next++;
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int reg)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = BRW_REGISTER_TYPE_F;
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = type;
}

/** Automatic reg constructor. */
fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
{
   init();

   this->file = GRF;
   this->reg = v->virtual_grf_alloc(v->type_size(type));
   this->reg_offset = 0;
   this->type = brw_type_for_base_type(type);
}

fs_reg *
fs_visitor::variable_storage(ir_variable *var)
{
   return (fs_reg *)hash_table_find(this->variable_ht, var);
}

void
import_uniforms_callback(const void *key,
			 void *data,
			 void *closure)
{
   struct hash_table *dst_ht = (struct hash_table *)closure;
   const fs_reg *reg = (const fs_reg *)data;

   if (reg->file != UNIFORM)
      return;

   hash_table_insert(dst_ht, data, key);
}

/* For 16-wide, we need to reuse the uniform setup from the 8-wide dispatch.
 * This brings in those uniform definitions.
 */
void
fs_visitor::import_uniforms(fs_visitor *v)
{
   hash_table_call_foreach(v->variable_ht,
			   import_uniforms_callback,
			   variable_ht);
   this->params_remap = v->params_remap;
}

/* Our support for uniforms is piggy-backed on the struct
 * gl_fragment_program, because that's where the values actually
 * get stored, rather than in some global gl_shader_program uniform
 * store.
 */
int
fs_visitor::setup_uniform_values(int loc, const glsl_type *type)
{
   unsigned int offset = 0;

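   /* A matrix is handled as a sequence of float column vectors. */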
   if (type->is_matrix()) {
      const glsl_type *column = glsl_type::get_instance(GLSL_TYPE_FLOAT,
							type->vector_elements,
							1);

      for (unsigned int i = 0; i < type->matrix_columns; i++) {
	 offset += setup_uniform_values(loc + offset, column);
      }

      return offset;
   }

   switch (type->base_type) {
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_BOOL:
      for (unsigned int i = 0; i < type->vector_elements; i++) {
	 unsigned int param = c->prog_data.nr_params++;

	 assert(param < ARRAY_SIZE(c->prog_data.param));

	 if (ctx->Const.NativeIntegers) {
	    c->prog_data.param_convert[param] = PARAM_NO_CONVERT;
	 } else {
	    switch (type->base_type) {
	    case GLSL_TYPE_FLOAT:
	       c->prog_data.param_convert[param] = PARAM_NO_CONVERT;
	       break;
	    case GLSL_TYPE_UINT:
	       c->prog_data.param_convert[param] = PARAM_CONVERT_F2U;
	       break;
	    case GLSL_TYPE_INT:
	       c->prog_data.param_convert[param] = PARAM_CONVERT_F2I;
	       break;
	    case GLSL_TYPE_BOOL:
	       c->prog_data.param_convert[param] = PARAM_CONVERT_F2B;
	       break;
	    default:
	       assert(!"not reached");
	       c->prog_data.param_convert[param] = PARAM_NO_CONVERT;
	       break;
	    }
	 }
	 this->param_index[param] = loc;
	 this->param_offset[param] = i;
      }
      return 1;

   case GLSL_TYPE_STRUCT:
      for (unsigned int i = 0; i < type->length; i++) {
	 offset += setup_uniform_values(loc + offset,
					type->fields.structure[i].type);
      }
      return offset;

   case GLSL_TYPE_ARRAY:
      for (unsigned int i = 0; i < type->length; i++) {
	 offset += setup_uniform_values(loc + offset, type->fields.array);
      }
      return offset;

   case GLSL_TYPE_SAMPLER:
      /* The sampler takes up a slot, but we don't use any values from it. */
      return 1;

   default:
      assert(!"not reached");
      return 0;
   }
}


/* Our support for builtin uniforms is even scarier than non-builtin.
 * It sits on top of the PROG_STATE_VAR parameters that are
 * automatically updated from GL context state.
 */
void
fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
{
   const ir_state_slot *const slots = ir->state_slots;
   assert(ir->state_slots != NULL);

   for (unsigned int i = 0; i < ir->num_state_slots; i++) {
      /* This state reference has already been set up by ir_to_mesa, but we'll
       * get the same index back here.
       */
      int index = _mesa_add_state_reference(this->fp->Base.Parameters,
					    (gl_state_index *)slots[i].tokens);

      /* Add each of the unique swizzles of the element as a parameter.
       * This'll end up matching the expected layout of the
       * array/matrix/structure we're trying to fill in.
       */
      int last_swiz = -1;
      for (unsigned int j = 0; j < 4; j++) {
	 int swiz = GET_SWZ(slots[i].swizzle, j);
	 if (swiz == last_swiz)
	    break;
	 last_swiz = swiz;

	 c->prog_data.param_convert[c->prog_data.nr_params] =
	    PARAM_NO_CONVERT;
	 this->param_index[c->prog_data.nr_params] = index;
	 this->param_offset[c->prog_data.nr_params] = swiz;
	 c->prog_data.nr_params++;
      }
   }
}

fs_reg *
fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   fs_reg wpos = *reg;
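   /* Whether Y must be flipped depends both on the coordinate origin the
    * shader declared and on whether we're rendering to an FBO, so XOR the
    * two factors together.
    */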
   bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;

   /* gl_FragCoord.x */
   if (ir->pixel_center_integer) {
      emit(BRW_OPCODE_MOV, wpos, this->pixel_x);
   } else {
      emit(BRW_OPCODE_ADD, wpos, this->pixel_x, fs_reg(0.5f));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.y */
   if (!flip && ir->pixel_center_integer) {
      emit(BRW_OPCODE_MOV, wpos, this->pixel_y);
   } else {
      fs_reg pixel_y = this->pixel_y;
      float offset = (ir->pixel_center_integer ? 0.0 : 0.5);

      if (flip) {
	 pixel_y.negate = true;
	 offset += c->key.drawable_height - 1.0;
      }

      emit(BRW_OPCODE_ADD, wpos, pixel_y, fs_reg(offset));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.z */
   if (intel->gen >= 6) {
      emit(BRW_OPCODE_MOV, wpos,
	   fs_reg(brw_vec8_grf(c->source_depth_reg, 0)));
   } else {
      emit(FS_OPCODE_LINTERP, wpos,
           this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           interp_reg(FRAG_ATTRIB_WPOS, 2));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.w: Already set up in emit_interpolation */
   emit(BRW_OPCODE_MOV, wpos, this->wpos_w);

   return reg;
}

fs_inst *
fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
                         glsl_interp_qualifier interpolation_mode)
{
   brw_wm_barycentric_interp_mode barycoord_mode;
   if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
      barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
   else
      barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
   return emit(FS_OPCODE_LINTERP, attr,
               this->delta_x[barycoord_mode],
               this->delta_y[barycoord_mode], interp);
}

fs_reg *
fs_visitor::emit_general_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
   fs_reg attr = *reg;

   unsigned int array_elements;
   const glsl_type *type;

   if (ir->type->is_array()) {
      array_elements = ir->type->length;
      if (array_elements == 0) {
	 fail("dereferenced array '%s' has length 0\n", ir->name);
      }
      type = ir->type->fields.array;
   } else {
      array_elements = 1;
      type = ir->type;
   }

   glsl_interp_qualifier interpolation_mode =
      ir->determine_interpolation_mode(c->key.flat_shade);

   int location = ir->location;
   for (unsigned int i = 0; i < array_elements; i++) {
      for (unsigned int j = 0; j < type->matrix_columns; j++) {
	 if (urb_setup[location] == -1) {
	    /* If there's no incoming setup data for this slot, don't
	     * emit interpolation for it.
	     */
	    attr.reg_offset += type->vector_elements;
	    location++;
	    continue;
	 }

	 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
	    /* Constant interpolation (flat shading) case. The SF has
	     * handed us defined values in only the constant offset
	     * field of the setup reg.
	     */
	    for (unsigned int k = 0; k < type->vector_elements; k++) {
	       struct brw_reg interp = interp_reg(location, k);
	       interp = suboffset(interp, 3);
               interp.type = reg->type;
	       emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
	       attr.reg_offset++;
	    }
	 } else {
	    /* Smooth/noperspective interpolation case. */
	    for (unsigned int k = 0; k < type->vector_elements; k++) {
	       /* FINISHME: At some point we probably want to push
		* this farther by giving similar treatment to the
		* other potentially constant components of the
		* attribute, as well as making brw_vs_constval.c
		* handle varyings other than gl_TexCoord.
		*/
	       if (location >= FRAG_ATTRIB_TEX0 &&
		   location <= FRAG_ATTRIB_TEX7 &&
		   k == 3 && !(c->key.proj_attrib_mask & (1 << location))) {
		  emit(BRW_OPCODE_MOV, attr, fs_reg(1.0f));
	       } else {
		  struct brw_reg interp = interp_reg(location, k);
                  emit_linterp(attr, fs_reg(interp), interpolation_mode);
		  if (intel->gen < 6) {
		     emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
		  }
	       }
	       attr.reg_offset++;
	    }

	 }
	 location++;
      }
   }

   return reg;
}

fs_reg *
fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);

   /* The frontfacing comes in as a bit in the thread payload. */
   if (intel->gen >= 6) {
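      /* On gen6+, bit 15 of g0.0 is set when the primitive is back-facing:
       * shift it down to bit 0, invert it, and mask so the result is 1 for
       * front-facing pixels and 0 otherwise.
       */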
      emit(BRW_OPCODE_ASR, *reg,
	   fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
	   fs_reg(15));
      emit(BRW_OPCODE_NOT, *reg, *reg);
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
   } else {
      struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
      /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
       * us front face
       */
      fs_inst *inst = emit(BRW_OPCODE_CMP, *reg,
			   fs_reg(r1_6ud),
			   fs_reg(1u << 31));
      inst->conditional_mod = BRW_CONDITIONAL_L;
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
   }

   return reg;
}

fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
{
   switch (opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      break;
   default:
      assert(!"not reached: bad math opcode");
      return NULL;
   }

   /* Can't do hstride == 0 args to gen6 math, so expand it out.  We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * Gen 6 hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (intel->gen == 6 && (src.file == UNIFORM ||
			   src.abs ||
			   src.negate)) {
      fs_reg expanded = fs_reg(this, glsl_type::float_type);
      emit(BRW_OPCODE_MOV, expanded, src);
      src = expanded;
   }

   fs_inst *inst = emit(opcode, dst, src);

   if (intel->gen < 6) {
      inst->base_mrf = 2;
      inst->mlen = c->dispatch_width / 8;
   }

   return inst;
}

fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   int base_mrf = 2;
   fs_inst *inst;

   switch (opcode) {
   case SHADER_OPCODE_POW:
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      break;
   default:
      assert(!"not reached: unsupported binary math opcode.");
      return NULL;
   }

   if (intel->gen >= 7) {
      inst = emit(opcode, dst, src0, src1);
   } else if (intel->gen == 6) {
      /* Can't do hstride == 0 args to gen6 math, so expand it out.
       *
       * The hardware ignores source modifiers (negate and abs) on math
       * instructions, so we also move to a temp to set those up.
       */
      if (src0.file == UNIFORM || src0.abs || src0.negate) {
	 fs_reg expanded = fs_reg(this, glsl_type::float_type);
	 expanded.type = src0.type;
	 emit(BRW_OPCODE_MOV, expanded, src0);
	 src0 = expanded;
      }

      if (src1.file == UNIFORM || src1.abs || src1.negate) {
	 fs_reg expanded = fs_reg(this, glsl_type::float_type);
	 expanded.type = src1.type;
	 emit(BRW_OPCODE_MOV, expanded, src1);
	 src1 = expanded;
      }

      inst = emit(opcode, dst, src0, src1);
   } else {
      /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
       * "Message Payload":
       *
       * "Operand0[7].  For the INT DIV functions, this operand is the
       *  denominator."
       *  ...
       * "Operand1[7].  For the INT DIV functions, this operand is the
       *  numerator."
       */
      bool is_int_div = opcode != SHADER_OPCODE_POW;
      fs_reg &op0 = is_int_div ? src1 : src0;
      fs_reg &op1 = is_int_div ? src0 : src1;

      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
      inst = emit(opcode, dst, op0, reg_null_f);

      inst->base_mrf = base_mrf;
      inst->mlen = 2 * c->dispatch_width / 8;
   }
   return inst;
}

/**
 * To be called after the last _mesa_add_state_reference() call, to
 * set up prog_data.param[] for assign_curb_setup() and
 * setup_pull_constants().
 */
void
fs_visitor::setup_paramvalues_refs()
{
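   /* Only do this once, in the 8-wide pass, since both passes share the
    * same prog_data.
    */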
   if (c->dispatch_width != 8)
      return;

   /* Set up the pointers to ParamValues now that that array is finalized. */
   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
      c->prog_data.param[i] =
	 (const float *)fp->Base.Parameters->ParameterValues[this->param_index[i]] +
	 this->param_offset[i];
   }
}

void
fs_visitor::assign_curb_setup()
{
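   /* Each CURB register holds 8 float constants, so round the parameter
    * count up to a whole number of registers.
    */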
   c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
   if (c->dispatch_width == 8) {
      c->prog_data.first_curbe_grf = c->nr_payload_regs;
   } else {
      c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
   }

   /* Map the offsets in the UNIFORM file to fixed HW regs. */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (unsigned int i = 0; i < 3; i++) {
	 if (inst->src[i].file == UNIFORM) {
	    int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
	    struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
						  constant_nr / 8,
						  constant_nr % 8);

	    inst->src[i].file = FIXED_HW_REG;
	    inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
	 }
      }
   }
}

void
fs_visitor::calculate_urb_setup()
{
   for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
      urb_setup[i] = -1;
   }

   int urb_next = 0;
   /* Figure out where each of the incoming setup attributes lands. */
   if (intel->gen >= 6) {
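      /* Each input actually read by the FS gets the next slot, in
       * attribute order.
       */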
      for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
	 if (fp->Base.InputsRead & BITFIELD64_BIT(i)) {
	    urb_setup[i] = urb_next++;
	 }
      }
   } else {
      /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
      for (unsigned int i = 0; i < VERT_RESULT_MAX; i++) {
	 if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
	    int fp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i);

	    if (fp_index >= 0)
	       urb_setup[fp_index] = urb_next++;
	 }
      }

      /*
       * It's an FS-only attribute, and we did the interpolation for this
       * attribute in the SF thread. So, count it here, too.
       *
       * See compile_sf_prog() for more info.
       */
      if (brw->fragment_program->Base.InputsRead & BITFIELD64_BIT(FRAG_ATTRIB_PNTC))
         urb_setup[FRAG_ATTRIB_PNTC] = urb_next++;
   }

   /* Each attribute is 4 setup channels, each of which is half a reg. */
   c->prog_data.urb_read_length = urb_next * 2;
}

void
fs_visitor::assign_urb_setup()
{
   int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;

   /* Offset all the urb_setup[] indices by the actual position of the
    * setup regs, now that the location of the constants has been chosen.
    */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->opcode == FS_OPCODE_LINTERP) {
	 assert(inst->src[2].file == FIXED_HW_REG);
	 inst->src[2].fixed_hw_reg.nr += urb_start;
      }

      if (inst->opcode == FS_OPCODE_CINTERP) {
	 assert(inst->src[0].file == FIXED_HW_REG);
	 inst->src[0].fixed_hw_reg.nr += urb_start;
      }
   }

   this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
}

/**
 * Split large virtual GRFs into separate components if we can.
 *
 * This is mostly duplicated with what brw_fs_vector_splitting does,
 * but that's really conservative because it's afraid of doing
 * splitting that doesn't result in real progress after the rest of
 * the optimization phases, which would cause infinite looping in
 * optimization.  We can do it once here, safely.  This also has the
 * opportunity to split interpolated values, or maybe even uniforms,
 * which we don't have at the IR level.
 *
 * We want to split, because virtual GRFs are what we register
 * allocate and spill (due to contiguousness requirements for some
 * instructions), and they're what we naturally generate in the
 * codegen process, but most virtual GRFs don't actually need to be
 * contiguous sets of GRFs.  If we split, we'll end up with reduced
 * live intervals and better dead code elimination and coalescing.
 */
void
fs_visitor::split_virtual_grfs()
{
   int num_vars = this->virtual_grf_next;
   bool split_grf[num_vars];
   int new_virtual_grf[num_vars];

   /* Try to split anything larger than one register. */
   for (int i = 0; i < num_vars; i++) {
      if (this->virtual_grf_sizes[i] != 1)
	 split_grf[i] = true;
      else
	 split_grf[i] = false;
   }

   if (brw->has_pln &&
       this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
      /* PLN opcodes rely on the delta_xy being contiguous.  We only have to
       * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
       * Gen6, that was the only supported interpolation mode, and since Gen6,
       * delta_x and delta_y are in fixed hardware registers.
       */
      split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
         false;
   }

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* Texturing produces 4 contiguous registers, so no splitting. */
      if (inst->is_tex()) {
	 split_grf[inst->dst.reg] = false;
      }
   }

   /* Allocate new space for split regs.  Note that the virtual
    * numbers will be contiguous.
    */
   for (int i = 0; i < num_vars; i++) {
      if (split_grf[i]) {
	 new_virtual_grf[i] = virtual_grf_alloc(1);
	 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
	    int reg = virtual_grf_alloc(1);
	    assert(reg == new_virtual_grf[i] + j - 1);
	    (void) reg;
	 }
	 this->virtual_grf_sizes[i] = 1;
      }
   }

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->dst.file == GRF &&
	  split_grf[inst->dst.reg] &&
	  inst->dst.reg_offset != 0) {
	 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
			  inst->dst.reg_offset - 1);
	 inst->dst.reg_offset = 0;
      }
      for (int i = 0; i < 3; i++) {
	 if (inst->src[i].file == GRF &&
	     split_grf[inst->src[i].reg] &&
	     inst->src[i].reg_offset != 0) {
	    inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
				inst->src[i].reg_offset - 1);
	    inst->src[i].reg_offset = 0;
	 }
      }
   }
   this->live_intervals_valid = false;
}

bool
fs_visitor::remove_dead_constants()
{
   if (c->dispatch_width == 8) {
      this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);

      for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
	 this->params_remap[i] = -1;

      /* Find which params are still in use. */
      foreach_list(node, &this->instructions) {
	 fs_inst *inst = (fs_inst *)node;

	 for (int i = 0; i < 3; i++) {
	    int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;

	    if (inst->src[i].file != UNIFORM)
	       continue;

	    assert(constant_nr < (int)c->prog_data.nr_params);

	    /* For now, set this to non-negative.  We'll give it the
	     * actual new number in a moment, in order to keep the
	     * register numbers nicely ordered.
	     */
	    this->params_remap[constant_nr] = 0;
	 }
      }

      /* Figure out what the new numbers for the params will be.  At some
       * point when we're doing uniform array access, we're going to want
       * to keep the distinction between .reg and .reg_offset, but for
       * now we don't care.
       */
      unsigned int new_nr_params = 0;
      for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
	 if (this->params_remap[i] != -1) {
	    this->params_remap[i] = new_nr_params++;
	 }
      }

      /* Update the list of params to be uploaded to match our new numbering. */
      for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
	 int remapped = this->params_remap[i];

	 if (remapped == -1)
	    continue;

	 /* We've already done setup_paramvalues_refs() so no need to worry
	  * about param_index and param_offset.
	  */
	 c->prog_data.param[remapped] = c->prog_data.param[i];
	 c->prog_data.param_convert[remapped] = c->prog_data.param_convert[i];
      }

      c->prog_data.nr_params = new_nr_params;
   } else {
      /* This should have been generated in the 8-wide pass already. */
      assert(this->params_remap);
   }

   /* Now do the renumbering of the shader to remove unused params. */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
	 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;

	 if (inst->src[i].file != UNIFORM)
	    continue;

	 assert(this->params_remap[constant_nr] != -1);
	 inst->src[i].reg = this->params_remap[constant_nr];
	 inst->src[i].reg_offset = 0;
      }
   }

   return true;
}

/**
 * Choose accesses from the UNIFORM file to demote to using the pull
 * constant buffer.
 *
 * We allow a fragment shader to have more than the GLSL-required minimum
 * maximum number of fragment shader uniform components (64).  If there
 * are too many of these, they'd fill up all of the register space.
 * So, this will push some of them out to the pull constant buffer and
 * update the program to load them.
 */
void
fs_visitor::setup_pull_constants()
{
   /* Only allow 16 registers (128 uniform components) as push constants. */
   unsigned int max_uniform_components = 16 * 8;
   if (c->prog_data.nr_params <= max_uniform_components)
      return;

   if (c->dispatch_width == 16) {
      fail("Pull constants not supported in 16-wide\n");
      return;
   }

   /* Just demote the end of the list.  We could probably do better
    * here, demoting things that are rarely used in the program first.
    */
   int pull_uniform_base = max_uniform_components;
   int pull_uniform_count = c->prog_data.nr_params - pull_uniform_base;

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
	 if (inst->src[i].file != UNIFORM)
	    continue;

	 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
	 if (uniform_nr < pull_uniform_base)
	    continue;

	 fs_reg dst = fs_reg(this, glsl_type::float_type);
	 fs_inst *pull = new(mem_ctx) fs_inst(FS_OPCODE_PULL_CONSTANT_LOAD,
					      dst);
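	 /* The byte offset is aligned down to a whole vec4 (16 bytes); the
	  * smear set below selects the desired component within it.
	  */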
	 pull->offset = ((uniform_nr - pull_uniform_base) * 4) & ~15;
	 pull->ir = inst->ir;
	 pull->annotation = inst->annotation;
	 pull->base_mrf = 14;
	 pull->mlen = 1;

	 inst->insert_before(pull);

	 inst->src[i].file = GRF;
	 inst->src[i].reg = dst.reg;
	 inst->src[i].reg_offset = 0;
	 inst->src[i].smear = (uniform_nr - pull_uniform_base) & 3;
      }
   }

   for (int i = 0; i < pull_uniform_count; i++) {
      c->prog_data.pull_param[i] = c->prog_data.param[pull_uniform_base + i];
      c->prog_data.pull_param_convert[i] =
	 c->prog_data.param_convert[pull_uniform_base + i];
   }
   c->prog_data.nr_params -= pull_uniform_count;
   c->prog_data.nr_pull_params = pull_uniform_count;
}

/**
 * Attempts to move immediate constants into the immediate
 * constant slot of following instructions.
 *
 * Immediate constants are a bit tricky -- they have to be in the last
 * operand slot, and you can't do abs/negate on them.
 */

bool
fs_visitor::propagate_constants()
{
   bool progress = false;

   calculate_live_intervals();

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->opcode != BRW_OPCODE_MOV ||
	  inst->predicated ||
	  inst->dst.file != GRF || inst->src[0].file != IMM ||
	  inst->dst.type != inst->src[0].type ||
	  (c->dispatch_width == 16 &&
	   (inst->force_uncompressed || inst->force_sechalf)))
	 continue;

      /* Don't bother with cases where we should have had the
       * operation on the constant folded in GLSL already.
       */
      if (inst->saturate)
	 continue;

      /* Found a move of a constant to a GRF.  Find anything else using the GRF
       * before it's written, and replace it with the constant if we can.
       */
      for (fs_inst *scan_inst = (fs_inst *)inst->next;
	   !scan_inst->is_tail_sentinel();
	   scan_inst = (fs_inst *)scan_inst->next) {
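	 /* Stop at control flow, since the write of the constant doesn't
	  * necessarily dominate the instructions that follow it.
	  */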
	 if (scan_inst->opcode == BRW_OPCODE_DO ||
	     scan_inst->opcode == BRW_OPCODE_WHILE ||
	     scan_inst->opcode == BRW_OPCODE_ELSE ||
	     scan_inst->opcode == BRW_OPCODE_ENDIF) {
	    break;
	 }

	 for (int i = 2; i >= 0; i--) {
	    if (scan_inst->src[i].file != GRF ||
		scan_inst->src[i].reg != inst->dst.reg ||
		scan_inst->src[i].reg_offset != inst->dst.reg_offset)
	       continue;

	    /* Don't bother with cases where we should have had the
	     * operation on the constant folded in GLSL already.
	     */
	    if (scan_inst->src[i].negate || scan_inst->src[i].abs)
	       continue;

	    switch (scan_inst->opcode) {
	    case BRW_OPCODE_MOV:
	       scan_inst->src[i] = inst->src[0];
	       progress = true;
	       break;

	    case BRW_OPCODE_MUL:
	    case BRW_OPCODE_ADD:
	       if (i == 1) {
		  scan_inst->src[i] = inst->src[0];
		  progress = true;
	       } else if (i == 0 && scan_inst->src[1].file != IMM) {
		  /* Fit this constant in by commuting the operands.
		   * Exception: we can't do this for 32-bit integer MUL
		   * because it's asymmetric.
		   */
		  if (scan_inst->opcode == BRW_OPCODE_MUL &&
		      (scan_inst->src[1].type == BRW_REGISTER_TYPE_D ||
		       scan_inst->src[1].type == BRW_REGISTER_TYPE_UD))
		     break;
		  scan_inst->src[0] = scan_inst->src[1];
		  scan_inst->src[1] = inst->src[0];
		  progress = true;
	       }
	       break;

	    case BRW_OPCODE_CMP:
	    case BRW_OPCODE_IF:
	       if (i == 1) {
		  scan_inst->src[i] = inst->src[0];
		  progress = true;
	       } else if (i == 0 && scan_inst->src[1].file != IMM) {
		  uint32_t new_cmod;

		  new_cmod = brw_swap_cmod(scan_inst->conditional_mod);
		  if (new_cmod != ~0u) {
		     /* Fit this constant in by swapping the operands and
		      * flipping the test
		      */
		     scan_inst->src[0] = scan_inst->src[1];
		     scan_inst->src[1] = inst->src[0];
		     scan_inst->conditional_mod = new_cmod;
		     progress = true;
		  }
	       }
	       break;

	    case BRW_OPCODE_SEL:
	       if (i == 1) {
		  scan_inst->src[i] = inst->src[0];
		  progress = true;
	       } else if (i == 0 && scan_inst->src[1].file != IMM) {
		  scan_inst->src[0] = scan_inst->src[1];
		  scan_inst->src[1] = inst->src[0];

		  /* If this was predicated, flipping operands means
		   * we also need to flip the predicate.
		   */
		  if (scan_inst->conditional_mod == BRW_CONDITIONAL_NONE) {
		     scan_inst->predicate_inverse =
			!scan_inst->predicate_inverse;
		  }
		  progress = true;
	       }
	       break;

	    case SHADER_OPCODE_RCP:
	       /* The hardware doesn't do math on immediate values
		* (because why are you doing that, seriously?), but
		* the correct answer is to just constant fold it
		* anyway.
		*/
	       assert(i == 0);
	       if (inst->src[0].imm.f != 0.0f) {
		  scan_inst->opcode = BRW_OPCODE_MOV;
		  scan_inst->src[0] = inst->src[0];
		  scan_inst->src[0].imm.f = 1.0f / scan_inst->src[0].imm.f;
		  progress = true;
	       }
	       break;

	    default:
	       break;
	    }
	 }

	 if (scan_inst->dst.file == GRF &&
	     scan_inst->dst.reg == inst->dst.reg &&
	     (scan_inst->dst.reg_offset == inst->dst.reg_offset ||
	      scan_inst->is_tex())) {
	    break;
	 }
      }
   }

   if (progress)
       this->live_intervals_valid = false;

   return progress;
}

/**
 * Applies simple algebraic simplifications to expressions involving
 * immediate operands, such as replacing a multiply by 1.0 with a MOV.
 */

bool
fs_visitor::opt_algebraic()
{
   bool progress = false;

   calculate_live_intervals();

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      switch (inst->opcode) {
      case BRW_OPCODE_MUL:
	 if (inst->src[1].file != IMM)
	    continue;

	 /* a * 1.0 = a */
	 if (inst->src[1].type == BRW_REGISTER_TYPE_F &&
	     inst->src[1].imm.f == 1.0) {
	    inst->opcode = BRW_OPCODE_MOV;
	    inst->src[1] = reg_undef;
	    progress = true;
	    break;
	 }

	 break;
      default:
	 break;
      }
   }

   return progress;
}

/**
 * Must be called after calculate_live_intervals() to remove unused
 * writes to registers -- register allocation will fail otherwise
 * because something defined but never used won't be considered to
 * interfere with other regs.
 */
bool
fs_visitor::dead_code_eliminate()
{
   bool progress = false;
   int pc = 0;

   calculate_live_intervals();

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

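      /* If the last use of this virtual GRF is at or before this
       * instruction, the write is dead.
       */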
      if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
	 inst->remove();
	 progress = true;
      }

      pc++;
   }

   if (progress)
      live_intervals_valid = false;

   return progress;
}

/**
 * Implements a second type of register coalescing: This one checks if
 * the two regs involved in a raw move don't interfere, in which case
 * they can both be stored in the same place and the MOV removed.
 */
bool
fs_visitor::register_coalesce_2()
{
   bool progress = false;

   calculate_live_intervals();

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->opcode != BRW_OPCODE_MOV ||
	  inst->predicated ||
	  inst->saturate ||
	  inst->src[0].file != GRF ||
	  inst->src[0].negate ||
	  inst->src[0].abs ||
	  inst->src[0].smear != -1 ||
	  inst->dst.file != GRF ||
	  inst->dst.type != inst->src[0].type ||
	  virtual_grf_sizes[inst->src[0].reg] != 1 ||
	  virtual_grf_interferes(inst->dst.reg, inst->src[0].reg)) {
	 continue;
      }

      int reg_from = inst->src[0].reg;
      assert(inst->src[0].reg_offset == 0);
      int reg_to = inst->dst.reg;
      int reg_to_offset = inst->dst.reg_offset;

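      /* Rewrite every def and use of reg_from to point at reg_to instead. */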
      foreach_list_safe(node, &this->instructions) {
	 fs_inst *scan_inst = (fs_inst *)node;

	 if (scan_inst->dst.file == GRF &&
	     scan_inst->dst.reg == reg_from) {
	    scan_inst->dst.reg = reg_to;
	    scan_inst->dst.reg_offset = reg_to_offset;
	 }
	 for (int i = 0; i < 3; i++) {
	    if (scan_inst->src[i].file == GRF &&
		scan_inst->src[i].reg == reg_from) {
	       scan_inst->src[i].reg = reg_to;
	       scan_inst->src[i].reg_offset = reg_to_offset;
	    }
	 }
      }

      inst->remove();
      live_intervals_valid = false;
      progress = true;
      continue;
   }

   return progress;
}

bool
fs_visitor::register_coalesce()
{
   bool progress = false;
   int if_depth = 0;
   int loop_depth = 0;

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* Make sure that we dominate the instructions we're going to
       * scan for interfering with our coalescing, or we won't have
       * scanned enough to see if anything interferes with our
       * coalescing.  We don't dominate the following instructions if
       * we're in a loop or an if block.
       */
      switch (inst->opcode) {
      case BRW_OPCODE_DO:
	 loop_depth++;
	 break;
      case BRW_OPCODE_WHILE:
	 loop_depth--;
	 break;
      case BRW_OPCODE_IF:
	 if_depth++;
	 break;
      case BRW_OPCODE_ENDIF:
	 if_depth--;
	 break;
      default:
	 break;
      }
      if (loop_depth || if_depth)
	 continue;

      if (inst->opcode != BRW_OPCODE_MOV ||
	  inst->predicated ||
	  inst->saturate ||
	  inst->dst.file != GRF || (inst->src[0].file != GRF &&
				    inst->src[0].file != UNIFORM) ||
	  inst->dst.type != inst->src[0].type)
	 continue;

      bool has_source_modifiers = inst->src[0].abs || inst->src[0].negate;

      /* Found a move of a GRF to a GRF.  Let's see if we can coalesce
       * them: check for no writes to either one until the exit of the
       * program.
       */
      bool interfered = false;

      for (fs_inst *scan_inst = (fs_inst *)inst->next;
	   !scan_inst->is_tail_sentinel();
	   scan_inst = (fs_inst *)scan_inst->next) {
	 if (scan_inst->dst.file == GRF) {
	    if (scan_inst->dst.reg == inst->dst.reg &&
		(scan_inst->dst.reg_offset == inst->dst.reg_offset ||
		 scan_inst->is_tex())) {
	       interfered = true;
	       break;
	    }
	    if (inst->src[0].file == GRF &&
		scan_inst->dst.reg == inst->src[0].reg &&
		(scan_inst->dst.reg_offset == inst->src[0].reg_offset ||
		 scan_inst->is_tex())) {
	       interfered = true;
	       break;
	    }
	 }

	 /* The gen6 MATH instruction can't handle source modifiers or
	  * unusual register regions, so avoid coalescing those for
	  * now.  We should do something more specific.
	  */
	 if (intel->gen >= 6 &&
	     scan_inst->is_math() &&
	     (has_source_modifiers || inst->src[0].file == UNIFORM)) {
	    interfered = true;
	    break;
	 }

	 /* The accumulator result appears to get used for the
	  * conditional modifier generation.  When negating a UD
	  * value, there is a 33rd bit generated for the sign in the
	  * accumulator value, so now you can't check, for example,
	  * equality with a 32-bit value.  See piglit fs-op-neg-uint.
	  */
	 if (scan_inst->conditional_mod &&
	     inst->src[0].negate &&
	     inst->src[0].type == BRW_REGISTER_TYPE_UD) {
	    interfered = true;
	    break;
	 }
      }
      if (interfered) {
	 continue;
      }

      /* Rewrite the later usage to point at the source of the move to
       * be removed.
       */
      for (fs_inst *scan_inst = inst;
	   !scan_inst->is_tail_sentinel();
	   scan_inst = (fs_inst *)scan_inst->next) {
	 for (int i = 0; i < 3; i++) {
	    if (scan_inst->src[i].file == GRF &&
		scan_inst->src[i].reg == inst->dst.reg &&
		scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
	       fs_reg new_src = inst->src[0];
               if (scan_inst->src[i].abs) {
                  new_src.negate = 0;
                  new_src.abs = 1;
               }
	       new_src.negate ^= scan_inst->src[i].negate;
	       scan_inst->src[i] = new_src;
	    }
	 }
      }

      inst->remove();
      progress = true;
   }

   if (progress)
      live_intervals_valid = false;

   return progress;
}


bool
fs_visitor::compute_to_mrf()
{
   bool progress = false;
   int next_ip = 0;

   calculate_live_intervals();

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      int ip = next_ip;
      next_ip++;

      if (inst->opcode != BRW_OPCODE_MOV ||
	  inst->predicated ||
	  inst->dst.file != MRF || inst->src[0].file != GRF ||
	  inst->dst.type != inst->src[0].type ||
	  inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
	 continue;

      /* Work out which hardware MRF registers are written by this
       * instruction.
       */
      int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
      int mrf_high;
      if (inst->dst.reg & BRW_MRF_COMPR4) {
	 mrf_high = mrf_low + 4;
      } else if (c->dispatch_width == 16 &&
		 (!inst->force_uncompressed && !inst->force_sechalf)) {
	 mrf_high = mrf_low + 1;
      } else {
	 mrf_high = mrf_low;
      }

      /* Can't compute-to-MRF this GRF if someone else was going to
       * read it later.
       */
      if (this->virtual_grf_use[inst->src[0].reg] > ip)
	 continue;

      /* Found a move of a GRF to a MRF.  Let's see if we can go
       * rewrite the thing that made this GRF to write into the MRF.
       */
      fs_inst *scan_inst;
      for (scan_inst = (fs_inst *)inst->prev;
	   scan_inst->prev != NULL;
	   scan_inst = (fs_inst *)scan_inst->prev) {
	 if (scan_inst->dst.file == GRF &&
	     scan_inst->dst.reg == inst->src[0].reg) {
	    /* Found the last instruction to write the reg we want to
	     * turn into a compute-to-MRF.
	     */

	    if (scan_inst->is_tex()) {
	       /* Texturing writes several contiguous regs, so we can't
		* compute-to-mrf that.
		*/
	       break;
	    }

	    /* If it's predicated, it (probably) didn't populate all
	     * the channels.  We might be able to rewrite everything
	     * that writes that reg, but it would require smarter
	     * tracking to delay the rewriting until complete success.
	     */
	    if (scan_inst->predicated)
	       break;

	    /* If it only writes half of the register, and it's not the
	     * same half as the MOV we're trying to remove, bail for now.
	     */
	    if (scan_inst->force_uncompressed != inst->force_uncompressed ||
		scan_inst->force_sechalf != inst->force_sechalf) {
	       break;
	    }

	    /* SEND instructions can't have MRF as a destination. */
	    if (scan_inst->mlen)
	       break;

	    if (intel->gen >= 6) {
	       /* gen6 math instructions must have the destination be
		* GRF, so no compute-to-MRF for them.
		*/
	       if (scan_inst->is_math()) {
		  break;
	       }
	    }

	    if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
	       /* Found the creator of our MRF's source value. */
	       scan_inst->dst.file = MRF;
	       scan_inst->dst.reg = inst->dst.reg;
	       scan_inst->saturate |= inst->saturate;
	       inst->remove();
	       progress = true;
	    }
	    break;
	 }

	 /* We don't handle flow control here.  Most computation of
	  * values that end up in MRFs happens shortly before the MRF
	  * write anyway.
	  */
	 if (scan_inst->opcode == BRW_OPCODE_DO ||
	     scan_inst->opcode == BRW_OPCODE_WHILE ||
	     scan_inst->opcode == BRW_OPCODE_ELSE ||
	     scan_inst->opcode == BRW_OPCODE_ENDIF) {
	    break;
	 }

	 /* You can't read from an MRF, so if someone else reads our
	  * MRF's source GRF that we wanted to rewrite, that stops us.
	  */
	 bool interfered = false;
	 for (int i = 0; i < 3; i++) {
	    if (scan_inst->src[i].file == GRF &&
		scan_inst->src[i].reg == inst->src[0].reg &&
		scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
	       interfered = true;
	    }
	 }
	 if (interfered)
	    break;

	 if (scan_inst->dst.file == MRF) {
	    /* If somebody else writes our MRF here, we can't
	     * compute-to-MRF before that.
	     */
	    int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
	    int scan_mrf_high;

	    if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
	       scan_mrf_high = scan_mrf_low + 4;
	    } else if (c->dispatch_width == 16 &&
		       (!scan_inst->force_uncompressed &&
			!scan_inst->force_sechalf)) {
	       scan_mrf_high = scan_mrf_low + 1;
	    } else {
	       scan_mrf_high = scan_mrf_low;
	    }

	    if (mrf_low == scan_mrf_low ||
		mrf_low == scan_mrf_high ||
		mrf_high == scan_mrf_low ||
		mrf_high == scan_mrf_high) {
	       break;
	    }
	 }

	 if (scan_inst->mlen > 0) {
	    /* Found a SEND instruction, which means that there are
	     * live values in MRFs from base_mrf to base_mrf +
	     * scan_inst->mlen - 1.  Don't go pushing our MRF write up
	     * above it.
	     */
	    if (mrf_low >= scan_inst->base_mrf &&
		mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
	       break;
	    }
	    if (mrf_high >= scan_inst->base_mrf &&
		mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
	       break;
	    }
	 }
      }
   }

   return progress;
}

/**
 * Walks through basic blocks, looking for repeated MRF writes and
 * removing the later ones.
 */
bool
fs_visitor::remove_duplicate_mrf_writes()
{
   fs_inst *last_mrf_move[16];
   bool progress = false;

   /* Need to update the MRF tracking for compressed instructions. */
   if (c->dispatch_width == 16)
      return false;

   memset(last_mrf_move, 0, sizeof(last_mrf_move));

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

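      /* Control flow ends a basic block, so forget everything we know
       * about earlier MRF writes.
       */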
      switch (inst->opcode) {
      case BRW_OPCODE_DO:
      case BRW_OPCODE_WHILE:
      case BRW_OPCODE_IF:
      case BRW_OPCODE_ELSE:
      case BRW_OPCODE_ENDIF:
	 memset(last_mrf_move, 0, sizeof(last_mrf_move));
	 continue;
      default:
	 break;
      }

      if (inst->opcode == BRW_OPCODE_MOV &&
	  inst->dst.file == MRF) {
	 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
	 if (prev_inst && inst->equals(prev_inst)) {
	    inst->remove();
	    progress = true;
	    continue;
	 }
      }

      /* Clear out the last-write records for MRFs that were overwritten. */
      if (inst->dst.file == MRF) {
	 last_mrf_move[inst->dst.reg] = NULL;
      }

      if (inst->mlen > 0) {
	 /* Found a SEND instruction, which will include two or fewer
	  * implied MRF writes.  We could do better here.
	  */
	 for (int i = 0; i < implied_mrf_writes(inst); i++) {
	    last_mrf_move[inst->base_mrf + i] = NULL;
	 }
      }

      /* Clear out any MRF move records whose sources got overwritten. */
      if (inst->dst.file == GRF) {
	 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
	    if (last_mrf_move[i] &&
		last_mrf_move[i]->src[0].reg == inst->dst.reg) {
	       last_mrf_move[i] = NULL;
	    }
	 }
      }

      if (inst->opcode == BRW_OPCODE_MOV &&
	  inst->dst.file == MRF &&
	  inst->src[0].file == GRF &&
	  !inst->predicated) {
	 last_mrf_move[inst->dst.reg] = inst;
      }
   }

   return progress;
}

/**
 * Possibly returns an instruction that set up @param reg.
 *
 * Sometimes we want to take the result of some expression/variable
 * dereference tree and rewrite the instruction generating the result
 * of the tree.  When processing the tree, we know that the
 * instructions generated are all writing temporaries that are dead
 * outside of this tree.  So, if we have some instructions that write
 * a temporary, we're free to point that temp write somewhere else.
 *
 * Note that this doesn't guarantee that the instruction generated
 * only reg -- it might be the size=4 destination of a texture instruction.
 */
fs_inst *
fs_visitor::get_instruction_generating_reg(fs_inst *start,
					   fs_inst *end,
					   fs_reg reg)
{
   if (end == start ||
       end->predicated ||
       end->force_uncompressed ||
       end->force_sechalf ||
       !reg.equals(end->dst)) {
      return NULL;
   } else {
      return end;
   }
}

bool
fs_visitor::run()
{
   uint32_t prog_offset_16 = 0;
   uint32_t orig_nr_params = c->prog_data.nr_params;

   brw_wm_payload_setup(brw, c);

   if (c->dispatch_width == 16) {
      /* Align to a 64-byte boundary. */
      while ((c->func.nr_insn * sizeof(struct brw_instruction)) % 64) {
	 brw_NOP(p);
      }

      /* Save off the start of this 16-wide program in case we succeed. */
      prog_offset_16 = c->func.nr_insn * sizeof(struct brw_instruction);

      brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
   }

   if (0) {
      emit_dummy_fs();
   } else {
      calculate_urb_setup();
      if (intel->gen < 6)
	 emit_interpolation_setup_gen4();
      else
	 emit_interpolation_setup_gen6();

      /* Generate FS IR for main().  (the visitor only descends into
       * functions called "main").
       */
      foreach_list(node, &*shader->ir) {
	 ir_instruction *ir = (ir_instruction *)node;
	 base_ir = ir;
	 this->result = reg_undef;
	 ir->accept(this);
      }
      if (failed)
	 return false;

      emit_fb_writes();

      split_virtual_grfs();

      setup_paramvalues_refs();
      setup_pull_constants();

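      /* Iterate the optimization passes until none of them makes progress. */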
      bool progress;
      do {
	 progress = false;

	 progress = remove_duplicate_mrf_writes() || progress;

	 progress = propagate_constants() || progress;
	 progress = opt_algebraic() || progress;
	 progress = opt_cse() || progress;
	 progress = opt_copy_propagate() || progress;
	 progress = register_coalesce() || progress;
	 progress = register_coalesce_2() || progress;
	 progress = compute_to_mrf() || progress;
	 progress = dead_code_eliminate() || progress;
      } while (progress);

      remove_dead_constants();

      schedule_instructions();

      assign_curb_setup();
      assign_urb_setup();

      if (0) {
	 /* Debug of register spilling: Go spill everything. */
	 int virtual_grf_count = virtual_grf_next;
	 for (int i = 0; i < virtual_grf_count; i++) {
	    spill_reg(i);
	 }
      }

      if (0)
	 assign_regs_trivial();
      else {
	 while (!assign_regs()) {
	    if (failed)
	       break;
	 }
      }
   }
   assert(force_uncompressed_stack == 0);
   assert(force_sechalf_stack == 0);

   if (failed)
      return false;

   generate_code();

   if (c->dispatch_width == 8) {
      c->prog_data.reg_blocks = brw_register_blocks(grf_used);
   } else {
      c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
      c->prog_data.prog_offset_16 = prog_offset_16;

      /* Make sure we didn't try to sneak in an extra uniform */
      assert(orig_nr_params == c->prog_data.nr_params);
      (void) orig_nr_params;
   }

   return !failed;
}

bool
brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
	       struct gl_shader_program *prog)
{
   struct intel_context *intel = &brw->intel;

   if (!prog)
      return false;

   struct brw_shader *shader =
     (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
   if (!shader)
      return false;

   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
      printf("GLSL IR for native fragment shader %d:\n", prog->Name);
      _mesa_print_ir(shader->ir, NULL);
      printf("\n\n");
   }

   /* Now the main event: Visit the shader IR and generate our FS IR for it.
    */
   c->dispatch_width = 8;

   fs_visitor v(c, prog, shader);
   if (!v.run()) {
      prog->LinkStatus = false;
      ralloc_strcat(&prog->InfoLog, v.fail_msg);

      _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
		    v.fail_msg);

      return false;
   }

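   /* On gen5+, also try a 16-wide compile, reusing the uniform layout from
    * the 8-wide pass.  The return value is ignored; a failure here just
    * leaves us with the 8-wide program.
    */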
   if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0) {
      c->dispatch_width = 16;
      fs_visitor v2(c, prog, shader);
      v2.import_uniforms(&v);
      v2.run();
   }

   c->prog_data.dispatch_width = 8;

   return true;
}

bool
brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
{
   struct brw_context *brw = brw_context(ctx);
   struct brw_wm_prog_key key;

   /* As a temporary measure we assume that all programs use dFdy() (and hence
    * need to be compiled differently depending on whether we're rendering to
    * an FBO).  FIXME: set this bool correctly based on the contents of the
    * program.
    */
   bool program_uses_dfdy = true;

   if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
      return true;

   struct gl_fragment_program *fp = (struct gl_fragment_program *)
      prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
   struct brw_fragment_program *bfp = brw_fragment_program(fp);

   memset(&key, 0, sizeof(key));

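   /* Fill in a best-guess program key; the real key is built from actual
    * GL state when the program is used for drawing.
    */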
   if (fp->UsesKill)
      key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;

   if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
      key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;

   /* Just assume depth testing. */
   key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
   key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;

   key.vp_outputs_written |= BITFIELD64_BIT(FRAG_ATTRIB_WPOS);
   for (int i = 0; i < FRAG_ATTRIB_MAX; i++) {
      if (!(fp->Base.InputsRead & BITFIELD64_BIT(i)))
	 continue;

      key.proj_attrib_mask |= 1 << i;

      int vp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i);

      if (vp_index >= 0)
	 key.vp_outputs_written |= BITFIELD64_BIT(vp_index);
   }

   key.clamp_fragment_color = true;

   for (int i = 0; i < BRW_MAX_TEX_UNIT; i++) {
      if (fp->Base.ShadowSamplers & (1 << i))
	 key.tex.compare_funcs[i] = GL_LESS;

      /* FINISHME: depth compares might use (0,0,0,W) for example */
      key.tex.swizzles[i] = SWIZZLE_XYZW;
   }

   if (fp->Base.InputsRead & FRAG_BIT_WPOS) {
      key.drawable_height = ctx->DrawBuffer->Height;
   }

   if ((fp->Base.InputsRead & FRAG_BIT_WPOS) || program_uses_dfdy) {
      key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
   }

   key.nr_color_regions = 1;

   key.program_string_id = bfp->id;

   uint32_t old_prog_offset = brw->wm.prog_offset;
   struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;

   bool success = do_wm_prog(brw, prog, bfp, &key);

   brw->wm.prog_offset = old_prog_offset;
   brw->wm.prog_data = old_prog_data;

   return success;
}
1929