brw_fs.cpp revision 82d25963a838cfebdeb9b080169979329ee850ea
1/*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24/** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31extern "C" {
32
33#include <sys/types.h>
34
35#include "main/macros.h"
36#include "main/shaderobj.h"
37#include "main/uniforms.h"
38#include "main/fbobject.h"
39#include "program/prog_parameter.h"
40#include "program/prog_print.h"
41#include "program/register_allocate.h"
42#include "program/sampler.h"
43#include "program/hash_table.h"
44#include "brw_context.h"
45#include "brw_eu.h"
46#include "brw_wm.h"
47}
48#include "brw_shader.h"
49#include "brw_fs.h"
50#include "glsl/glsl_types.h"
51#include "glsl/ir_print_visitor.h"
52
53int
54fs_visitor::type_size(const struct glsl_type *type)
55{
56   unsigned int size, i;
57
58   switch (type->base_type) {
59   case GLSL_TYPE_UINT:
60   case GLSL_TYPE_INT:
61   case GLSL_TYPE_FLOAT:
62   case GLSL_TYPE_BOOL:
63      return type->components();
64   case GLSL_TYPE_ARRAY:
65      return type_size(type->fields.array) * type->length;
66   case GLSL_TYPE_STRUCT:
67      size = 0;
68      for (i = 0; i < type->length; i++) {
69	 size += type_size(type->fields.structure[i].type);
70      }
71      return size;
72   case GLSL_TYPE_SAMPLER:
73      /* Samplers take up no register space, since they're baked in at
74       * link time.
75       */
76      return 0;
77   default:
78      assert(!"not reached");
79      return 0;
80   }
81}
82
83void
84fs_visitor::fail(const char *format, ...)
85{
86   va_list va;
87   char *msg;
88
89   if (failed)
90      return;
91
92   failed = true;
93
94   va_start(va, format);
95   msg = ralloc_vasprintf(mem_ctx, format, va);
96   va_end(va);
97   msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
98
99   this->fail_msg = msg;
100
101   if (INTEL_DEBUG & DEBUG_WM) {
102      fprintf(stderr, "%s",  msg);
103   }
104}
105
106void
107fs_visitor::push_force_uncompressed()
108{
109   force_uncompressed_stack++;
110}
111
112void
113fs_visitor::pop_force_uncompressed()
114{
115   force_uncompressed_stack--;
116   assert(force_uncompressed_stack >= 0);
117}
118
119void
120fs_visitor::push_force_sechalf()
121{
122   force_sechalf_stack++;
123}
124
125void
126fs_visitor::pop_force_sechalf()
127{
128   force_sechalf_stack--;
129   assert(force_sechalf_stack >= 0);
130}
131
132/**
133 * Returns how many MRFs an FS opcode will write over.
134 *
135 * Note that this is not the 0 or 1 implied writes in an actual gen
136 * instruction -- the FS opcodes often generate MOVs in addition.
137 */
138int
139fs_visitor::implied_mrf_writes(fs_inst *inst)
140{
141   if (inst->mlen == 0)
142      return 0;
143
144   switch (inst->opcode) {
145   case SHADER_OPCODE_RCP:
146   case SHADER_OPCODE_RSQ:
147   case SHADER_OPCODE_SQRT:
148   case SHADER_OPCODE_EXP2:
149   case SHADER_OPCODE_LOG2:
150   case SHADER_OPCODE_SIN:
151   case SHADER_OPCODE_COS:
152      return 1 * c->dispatch_width / 8;
153   case SHADER_OPCODE_POW:
154   case SHADER_OPCODE_INT_QUOTIENT:
155   case SHADER_OPCODE_INT_REMAINDER:
156      return 2 * c->dispatch_width / 8;
157   case SHADER_OPCODE_TEX:
158   case FS_OPCODE_TXB:
159   case SHADER_OPCODE_TXD:
160   case SHADER_OPCODE_TXF:
161   case SHADER_OPCODE_TXL:
162   case SHADER_OPCODE_TXS:
163      return 1;
164   case FS_OPCODE_FB_WRITE:
165      return 2;
166   case FS_OPCODE_PULL_CONSTANT_LOAD:
167   case FS_OPCODE_UNSPILL:
168      return 1;
169   case FS_OPCODE_SPILL:
170      return 2;
171   default:
172      assert(!"not reached");
173      return inst->mlen;
174   }
175}
176
177int
178fs_visitor::virtual_grf_alloc(int size)
179{
180   if (virtual_grf_array_size <= virtual_grf_next) {
181      if (virtual_grf_array_size == 0)
182	 virtual_grf_array_size = 16;
183      else
184	 virtual_grf_array_size *= 2;
185      virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
186				   virtual_grf_array_size);
187   }
188   virtual_grf_sizes[virtual_grf_next] = size;
189   return virtual_grf_next++;
190}
191
192/** Fixed HW reg constructor. */
193fs_reg::fs_reg(enum register_file file, int reg)
194{
195   init();
196   this->file = file;
197   this->reg = reg;
198   this->type = BRW_REGISTER_TYPE_F;
199}
200
201/** Fixed HW reg constructor. */
202fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
203{
204   init();
205   this->file = file;
206   this->reg = reg;
207   this->type = type;
208}
209
210/** Automatic reg constructor. */
211fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
212{
213   init();
214
215   this->file = GRF;
216   this->reg = v->virtual_grf_alloc(v->type_size(type));
217   this->reg_offset = 0;
218   this->type = brw_type_for_base_type(type);
219}
220
221fs_reg *
222fs_visitor::variable_storage(ir_variable *var)
223{
224   return (fs_reg *)hash_table_find(this->variable_ht, var);
225}
226
227void
228import_uniforms_callback(const void *key,
229			 void *data,
230			 void *closure)
231{
232   struct hash_table *dst_ht = (struct hash_table *)closure;
233   const fs_reg *reg = (const fs_reg *)data;
234
235   if (reg->file != UNIFORM)
236      return;
237
238   hash_table_insert(dst_ht, data, key);
239}
240
241/* For 16-wide, we need to follow from the uniform setup of 8-wide dispatch.
242 * This brings in those uniform definitions
243 */
244void
245fs_visitor::import_uniforms(fs_visitor *v)
246{
247   hash_table_call_foreach(v->variable_ht,
248			   import_uniforms_callback,
249			   variable_ht);
250   this->params_remap = v->params_remap;
251}
252
253/* Our support for uniforms is piggy-backed on the struct
254 * gl_fragment_program, because that's where the values actually
255 * get stored, rather than in some global gl_shader_program uniform
256 * store.
257 */
258int
259fs_visitor::setup_uniform_values(int loc, const glsl_type *type)
260{
261   unsigned int offset = 0;
262
263   if (type->is_matrix()) {
264      const glsl_type *column = glsl_type::get_instance(GLSL_TYPE_FLOAT,
265							type->vector_elements,
266							1);
267
268      for (unsigned int i = 0; i < type->matrix_columns; i++) {
269	 offset += setup_uniform_values(loc + offset, column);
270      }
271
272      return offset;
273   }
274
275   switch (type->base_type) {
276   case GLSL_TYPE_FLOAT:
277   case GLSL_TYPE_UINT:
278   case GLSL_TYPE_INT:
279   case GLSL_TYPE_BOOL:
280      for (unsigned int i = 0; i < type->vector_elements; i++) {
281	 unsigned int param = c->prog_data.nr_params++;
282
283	 assert(param < ARRAY_SIZE(c->prog_data.param));
284
285	 if (ctx->Const.NativeIntegers) {
286	    c->prog_data.param_convert[param] = PARAM_NO_CONVERT;
287	 } else {
288	    switch (type->base_type) {
289	    case GLSL_TYPE_FLOAT:
290	       c->prog_data.param_convert[param] = PARAM_NO_CONVERT;
291	       break;
292	    case GLSL_TYPE_UINT:
293	       c->prog_data.param_convert[param] = PARAM_CONVERT_F2U;
294	       break;
295	    case GLSL_TYPE_INT:
296	       c->prog_data.param_convert[param] = PARAM_CONVERT_F2I;
297	       break;
298	    case GLSL_TYPE_BOOL:
299	       c->prog_data.param_convert[param] = PARAM_CONVERT_F2B;
300	       break;
301	    default:
302	       assert(!"not reached");
303	       c->prog_data.param_convert[param] = PARAM_NO_CONVERT;
304	       break;
305	    }
306	 }
307	 this->param_index[param] = loc;
308	 this->param_offset[param] = i;
309      }
310      return 1;
311
312   case GLSL_TYPE_STRUCT:
313      for (unsigned int i = 0; i < type->length; i++) {
314	 offset += setup_uniform_values(loc + offset,
315					type->fields.structure[i].type);
316      }
317      return offset;
318
319   case GLSL_TYPE_ARRAY:
320      for (unsigned int i = 0; i < type->length; i++) {
321	 offset += setup_uniform_values(loc + offset, type->fields.array);
322      }
323      return offset;
324
325   case GLSL_TYPE_SAMPLER:
326      /* The sampler takes up a slot, but we don't use any values from it. */
327      return 1;
328
329   default:
330      assert(!"not reached");
331      return 0;
332   }
333}
334
335
336/* Our support for builtin uniforms is even scarier than non-builtin.
337 * It sits on top of the PROG_STATE_VAR parameters that are
338 * automatically updated from GL context state.
339 */
340void
341fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
342{
343   const ir_state_slot *const slots = ir->state_slots;
344   assert(ir->state_slots != NULL);
345
346   for (unsigned int i = 0; i < ir->num_state_slots; i++) {
347      /* This state reference has already been setup by ir_to_mesa, but we'll
348       * get the same index back here.
349       */
350      int index = _mesa_add_state_reference(this->fp->Base.Parameters,
351					    (gl_state_index *)slots[i].tokens);
352
353      /* Add each of the unique swizzles of the element as a parameter.
354       * This'll end up matching the expected layout of the
355       * array/matrix/structure we're trying to fill in.
356       */
357      int last_swiz = -1;
358      for (unsigned int j = 0; j < 4; j++) {
359	 int swiz = GET_SWZ(slots[i].swizzle, j);
360	 if (swiz == last_swiz)
361	    break;
362	 last_swiz = swiz;
363
364	 c->prog_data.param_convert[c->prog_data.nr_params] =
365	    PARAM_NO_CONVERT;
366	 this->param_index[c->prog_data.nr_params] = index;
367	 this->param_offset[c->prog_data.nr_params] = swiz;
368	 c->prog_data.nr_params++;
369      }
370   }
371}
372
373fs_reg *
374fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
375{
376   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
377   fs_reg wpos = *reg;
378   bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;
379
380   /* gl_FragCoord.x */
381   if (ir->pixel_center_integer) {
382      emit(BRW_OPCODE_MOV, wpos, this->pixel_x);
383   } else {
384      emit(BRW_OPCODE_ADD, wpos, this->pixel_x, fs_reg(0.5f));
385   }
386   wpos.reg_offset++;
387
388   /* gl_FragCoord.y */
389   if (!flip && ir->pixel_center_integer) {
390      emit(BRW_OPCODE_MOV, wpos, this->pixel_y);
391   } else {
392      fs_reg pixel_y = this->pixel_y;
393      float offset = (ir->pixel_center_integer ? 0.0 : 0.5);
394
395      if (flip) {
396	 pixel_y.negate = true;
397	 offset += c->key.drawable_height - 1.0;
398      }
399
400      emit(BRW_OPCODE_ADD, wpos, pixel_y, fs_reg(offset));
401   }
402   wpos.reg_offset++;
403
404   /* gl_FragCoord.z */
405   if (intel->gen >= 6) {
406      emit(BRW_OPCODE_MOV, wpos,
407	   fs_reg(brw_vec8_grf(c->source_depth_reg, 0)));
408   } else {
409      emit(FS_OPCODE_LINTERP, wpos,
410           this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
411           this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
412           interp_reg(FRAG_ATTRIB_WPOS, 2));
413   }
414   wpos.reg_offset++;
415
416   /* gl_FragCoord.w: Already set up in emit_interpolation */
417   emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
418
419   return reg;
420}
421
422fs_reg *
423fs_visitor::emit_general_interpolation(ir_variable *ir)
424{
425   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
426   reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
427   fs_reg attr = *reg;
428
429   unsigned int array_elements;
430   const glsl_type *type;
431
432   if (ir->type->is_array()) {
433      array_elements = ir->type->length;
434      if (array_elements == 0) {
435	 fail("dereferenced array '%s' has length 0\n", ir->name);
436      }
437      type = ir->type->fields.array;
438   } else {
439      array_elements = 1;
440      type = ir->type;
441   }
442
443   glsl_interp_qualifier interpolation_mode =
444      ir->determine_interpolation_mode(c->key.flat_shade);
445
446   int location = ir->location;
447   for (unsigned int i = 0; i < array_elements; i++) {
448      for (unsigned int j = 0; j < type->matrix_columns; j++) {
449	 if (urb_setup[location] == -1) {
450	    /* If there's no incoming setup data for this slot, don't
451	     * emit interpolation for it.
452	     */
453	    attr.reg_offset += type->vector_elements;
454	    location++;
455	    continue;
456	 }
457
458	 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
459	    /* Constant interpolation (flat shading) case. The SF has
460	     * handed us defined values in only the constant offset
461	     * field of the setup reg.
462	     */
463	    for (unsigned int k = 0; k < type->vector_elements; k++) {
464	       struct brw_reg interp = interp_reg(location, k);
465	       interp = suboffset(interp, 3);
466               interp.type = reg->type;
467	       emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
468	       attr.reg_offset++;
469	    }
470	 } else {
471	    /* Smooth/noperspective interpolation case. */
472	    for (unsigned int k = 0; k < type->vector_elements; k++) {
473	       /* FINISHME: At some point we probably want to push
474		* this farther by giving similar treatment to the
475		* other potentially constant components of the
476		* attribute, as well as making brw_vs_constval.c
477		* handle varyings other than gl_TexCoord.
478		*/
479	       if (location >= FRAG_ATTRIB_TEX0 &&
480		   location <= FRAG_ATTRIB_TEX7 &&
481		   k == 3 && !(c->key.proj_attrib_mask & (1 << location))) {
482		  emit(BRW_OPCODE_MOV, attr, fs_reg(1.0f));
483	       } else {
484		  struct brw_reg interp = interp_reg(location, k);
485                  brw_wm_barycentric_interp_mode barycoord_mode;
486                  if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
487                     barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
488                  else
489                     barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
490                  emit(FS_OPCODE_LINTERP, attr,
491                       this->delta_x[barycoord_mode],
492                       this->delta_y[barycoord_mode], fs_reg(interp));
493		  if (intel->gen < 6) {
494		     emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
495		  }
496	       }
497	       attr.reg_offset++;
498	    }
499
500	 }
501	 location++;
502      }
503   }
504
505   return reg;
506}
507
508fs_reg *
509fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
510{
511   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
512
513   /* The frontfacing comes in as a bit in the thread payload. */
514   if (intel->gen >= 6) {
515      emit(BRW_OPCODE_ASR, *reg,
516	   fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
517	   fs_reg(15));
518      emit(BRW_OPCODE_NOT, *reg, *reg);
519      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
520   } else {
521      struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
522      /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
523       * us front face
524       */
525      fs_inst *inst = emit(BRW_OPCODE_CMP, *reg,
526			   fs_reg(r1_6ud),
527			   fs_reg(1u << 31));
528      inst->conditional_mod = BRW_CONDITIONAL_L;
529      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
530   }
531
532   return reg;
533}
534
535fs_inst *
536fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
537{
538   switch (opcode) {
539   case SHADER_OPCODE_RCP:
540   case SHADER_OPCODE_RSQ:
541   case SHADER_OPCODE_SQRT:
542   case SHADER_OPCODE_EXP2:
543   case SHADER_OPCODE_LOG2:
544   case SHADER_OPCODE_SIN:
545   case SHADER_OPCODE_COS:
546      break;
547   default:
548      assert(!"not reached: bad math opcode");
549      return NULL;
550   }
551
552   /* Can't do hstride == 0 args to gen6 math, so expand it out.  We
553    * might be able to do better by doing execsize = 1 math and then
554    * expanding that result out, but we would need to be careful with
555    * masking.
556    *
557    * Gen 6 hardware ignores source modifiers (negate and abs) on math
558    * instructions, so we also move to a temp to set those up.
559    */
560   if (intel->gen == 6 && (src.file == UNIFORM ||
561			   src.abs ||
562			   src.negate)) {
563      fs_reg expanded = fs_reg(this, glsl_type::float_type);
564      emit(BRW_OPCODE_MOV, expanded, src);
565      src = expanded;
566   }
567
568   fs_inst *inst = emit(opcode, dst, src);
569
570   if (intel->gen < 6) {
571      inst->base_mrf = 2;
572      inst->mlen = c->dispatch_width / 8;
573   }
574
575   return inst;
576}
577
578fs_inst *
579fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
580{
581   int base_mrf = 2;
582   fs_inst *inst;
583
584   switch (opcode) {
585   case SHADER_OPCODE_POW:
586   case SHADER_OPCODE_INT_QUOTIENT:
587   case SHADER_OPCODE_INT_REMAINDER:
588      break;
589   default:
590      assert(!"not reached: unsupported binary math opcode.");
591      return NULL;
592   }
593
594   if (intel->gen >= 7) {
595      inst = emit(opcode, dst, src0, src1);
596   } else if (intel->gen == 6) {
597      /* Can't do hstride == 0 args to gen6 math, so expand it out.
598       *
599       * The hardware ignores source modifiers (negate and abs) on math
600       * instructions, so we also move to a temp to set those up.
601       */
602      if (src0.file == UNIFORM || src0.abs || src0.negate) {
603	 fs_reg expanded = fs_reg(this, glsl_type::float_type);
604	 expanded.type = src0.type;
605	 emit(BRW_OPCODE_MOV, expanded, src0);
606	 src0 = expanded;
607      }
608
609      if (src1.file == UNIFORM || src1.abs || src1.negate) {
610	 fs_reg expanded = fs_reg(this, glsl_type::float_type);
611	 expanded.type = src1.type;
612	 emit(BRW_OPCODE_MOV, expanded, src1);
613	 src1 = expanded;
614      }
615
616      inst = emit(opcode, dst, src0, src1);
617   } else {
618      /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
619       * "Message Payload":
620       *
621       * "Operand0[7].  For the INT DIV functions, this operand is the
622       *  denominator."
623       *  ...
624       * "Operand1[7].  For the INT DIV functions, this operand is the
625       *  numerator."
626       */
627      bool is_int_div = opcode != SHADER_OPCODE_POW;
628      fs_reg &op0 = is_int_div ? src1 : src0;
629      fs_reg &op1 = is_int_div ? src0 : src1;
630
631      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
632      inst = emit(opcode, dst, op0, reg_null_f);
633
634      inst->base_mrf = base_mrf;
635      inst->mlen = 2 * c->dispatch_width / 8;
636   }
637   return inst;
638}
639
640/**
641 * To be called after the last _mesa_add_state_reference() call, to
642 * set up prog_data.param[] for assign_curb_setup() and
643 * setup_pull_constants().
644 */
645void
646fs_visitor::setup_paramvalues_refs()
647{
648   if (c->dispatch_width != 8)
649      return;
650
651   /* Set up the pointers to ParamValues now that that array is finalized. */
652   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
653      c->prog_data.param[i] =
654	 (const float *)fp->Base.Parameters->ParameterValues[this->param_index[i]] +
655	 this->param_offset[i];
656   }
657}
658
659void
660fs_visitor::assign_curb_setup()
661{
662   c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
663   if (c->dispatch_width == 8) {
664      c->prog_data.first_curbe_grf = c->nr_payload_regs;
665   } else {
666      c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
667   }
668
669   /* Map the offsets in the UNIFORM file to fixed HW regs. */
670   foreach_list(node, &this->instructions) {
671      fs_inst *inst = (fs_inst *)node;
672
673      for (unsigned int i = 0; i < 3; i++) {
674	 if (inst->src[i].file == UNIFORM) {
675	    int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
676	    struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
677						  constant_nr / 8,
678						  constant_nr % 8);
679
680	    inst->src[i].file = FIXED_HW_REG;
681	    inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
682	 }
683      }
684   }
685}
686
687void
688fs_visitor::calculate_urb_setup()
689{
690   for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
691      urb_setup[i] = -1;
692   }
693
694   int urb_next = 0;
695   /* Figure out where each of the incoming setup attributes lands. */
696   if (intel->gen >= 6) {
697      for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
698	 if (fp->Base.InputsRead & BITFIELD64_BIT(i)) {
699	    urb_setup[i] = urb_next++;
700	 }
701      }
702   } else {
703      /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
704      for (unsigned int i = 0; i < VERT_RESULT_MAX; i++) {
705	 if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
706	    int fp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i);
707
708	    if (fp_index >= 0)
709	       urb_setup[fp_index] = urb_next++;
710	 }
711      }
712
713      /*
714       * It's a FS only attribute, and we did interpolation for this attribute
715       * in SF thread. So, count it here, too.
716       *
717       * See compile_sf_prog() for more info.
718       */
719      if (brw->fragment_program->Base.InputsRead & BITFIELD64_BIT(FRAG_ATTRIB_PNTC))
720         urb_setup[FRAG_ATTRIB_PNTC] = urb_next++;
721   }
722
723   /* Each attribute is 4 setup channels, each of which is half a reg. */
724   c->prog_data.urb_read_length = urb_next * 2;
725}
726
727void
728fs_visitor::assign_urb_setup()
729{
730   int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
731
732   /* Offset all the urb_setup[] index by the actual position of the
733    * setup regs, now that the location of the constants has been chosen.
734    */
735   foreach_list(node, &this->instructions) {
736      fs_inst *inst = (fs_inst *)node;
737
738      if (inst->opcode == FS_OPCODE_LINTERP) {
739	 assert(inst->src[2].file == FIXED_HW_REG);
740	 inst->src[2].fixed_hw_reg.nr += urb_start;
741      }
742
743      if (inst->opcode == FS_OPCODE_CINTERP) {
744	 assert(inst->src[0].file == FIXED_HW_REG);
745	 inst->src[0].fixed_hw_reg.nr += urb_start;
746      }
747   }
748
749   this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
750}
751
752/**
753 * Split large virtual GRFs into separate components if we can.
754 *
755 * This is mostly duplicated with what brw_fs_vector_splitting does,
756 * but that's really conservative because it's afraid of doing
757 * splitting that doesn't result in real progress after the rest of
758 * the optimization phases, which would cause infinite looping in
759 * optimization.  We can do it once here, safely.  This also has the
760 * opportunity to split interpolated values, or maybe even uniforms,
761 * which we don't have at the IR level.
762 *
763 * We want to split, because virtual GRFs are what we register
764 * allocate and spill (due to contiguousness requirements for some
765 * instructions), and they're what we naturally generate in the
766 * codegen process, but most virtual GRFs don't actually need to be
767 * contiguous sets of GRFs.  If we split, we'll end up with reduced
768 * live intervals and better dead code elimination and coalescing.
769 */
770void
771fs_visitor::split_virtual_grfs()
772{
773   int num_vars = this->virtual_grf_next;
774   bool split_grf[num_vars];
775   int new_virtual_grf[num_vars];
776
777   /* Try to split anything > 0 sized. */
778   for (int i = 0; i < num_vars; i++) {
779      if (this->virtual_grf_sizes[i] != 1)
780	 split_grf[i] = true;
781      else
782	 split_grf[i] = false;
783   }
784
785   if (brw->has_pln &&
786       this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
787      /* PLN opcodes rely on the delta_xy being contiguous.  We only have to
788       * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
789       * Gen6, that was the only supported interpolation mode, and since Gen6,
790       * delta_x and delta_y are in fixed hardware registers.
791       */
792      split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
793         false;
794   }
795
796   foreach_list(node, &this->instructions) {
797      fs_inst *inst = (fs_inst *)node;
798
799      /* Texturing produces 4 contiguous registers, so no splitting. */
800      if (inst->is_tex()) {
801	 split_grf[inst->dst.reg] = false;
802      }
803   }
804
805   /* Allocate new space for split regs.  Note that the virtual
806    * numbers will be contiguous.
807    */
808   for (int i = 0; i < num_vars; i++) {
809      if (split_grf[i]) {
810	 new_virtual_grf[i] = virtual_grf_alloc(1);
811	 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
812	    int reg = virtual_grf_alloc(1);
813	    assert(reg == new_virtual_grf[i] + j - 1);
814	    (void) reg;
815	 }
816	 this->virtual_grf_sizes[i] = 1;
817      }
818   }
819
820   foreach_list(node, &this->instructions) {
821      fs_inst *inst = (fs_inst *)node;
822
823      if (inst->dst.file == GRF &&
824	  split_grf[inst->dst.reg] &&
825	  inst->dst.reg_offset != 0) {
826	 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
827			  inst->dst.reg_offset - 1);
828	 inst->dst.reg_offset = 0;
829      }
830      for (int i = 0; i < 3; i++) {
831	 if (inst->src[i].file == GRF &&
832	     split_grf[inst->src[i].reg] &&
833	     inst->src[i].reg_offset != 0) {
834	    inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
835				inst->src[i].reg_offset - 1);
836	    inst->src[i].reg_offset = 0;
837	 }
838      }
839   }
840   this->live_intervals_valid = false;
841}
842
843bool
844fs_visitor::remove_dead_constants()
845{
846   if (c->dispatch_width == 8) {
847      this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
848
849      for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
850	 this->params_remap[i] = -1;
851
852      /* Find which params are still in use. */
853      foreach_list(node, &this->instructions) {
854	 fs_inst *inst = (fs_inst *)node;
855
856	 for (int i = 0; i < 3; i++) {
857	    int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
858
859	    if (inst->src[i].file != UNIFORM)
860	       continue;
861
862	    assert(constant_nr < (int)c->prog_data.nr_params);
863
864	    /* For now, set this to non-negative.  We'll give it the
865	     * actual new number in a moment, in order to keep the
866	     * register numbers nicely ordered.
867	     */
868	    this->params_remap[constant_nr] = 0;
869	 }
870      }
871
872      /* Figure out what the new numbers for the params will be.  At some
873       * point when we're doing uniform array access, we're going to want
874       * to keep the distinction between .reg and .reg_offset, but for
875       * now we don't care.
876       */
877      unsigned int new_nr_params = 0;
878      for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
879	 if (this->params_remap[i] != -1) {
880	    this->params_remap[i] = new_nr_params++;
881	 }
882      }
883
884      /* Update the list of params to be uploaded to match our new numbering. */
885      for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
886	 int remapped = this->params_remap[i];
887
888	 if (remapped == -1)
889	    continue;
890
891	 /* We've already done setup_paramvalues_refs() so no need to worry
892	  * about param_index and param_offset.
893	  */
894	 c->prog_data.param[remapped] = c->prog_data.param[i];
895	 c->prog_data.param_convert[remapped] = c->prog_data.param_convert[i];
896      }
897
898      c->prog_data.nr_params = new_nr_params;
899   } else {
900      /* This should have been generated in the 8-wide pass already. */
901      assert(this->params_remap);
902   }
903
904   /* Now do the renumbering of the shader to remove unused params. */
905   foreach_list(node, &this->instructions) {
906      fs_inst *inst = (fs_inst *)node;
907
908      for (int i = 0; i < 3; i++) {
909	 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
910
911	 if (inst->src[i].file != UNIFORM)
912	    continue;
913
914	 assert(this->params_remap[constant_nr] != -1);
915	 inst->src[i].reg = this->params_remap[constant_nr];
916	 inst->src[i].reg_offset = 0;
917      }
918   }
919
920   return true;
921}
922
923/**
924 * Choose accesses from the UNIFORM file to demote to using the pull
925 * constant buffer.
926 *
927 * We allow a fragment shader to have more than the specified minimum
928 * maximum number of fragment shader uniform components (64).  If
929 * there are too many of these, they'd fill up all of register space.
930 * So, this will push some of them out to the pull constant buffer and
931 * update the program to load them.
932 */
933void
934fs_visitor::setup_pull_constants()
935{
936   /* Only allow 16 registers (128 uniform components) as push constants. */
937   unsigned int max_uniform_components = 16 * 8;
938   if (c->prog_data.nr_params <= max_uniform_components)
939      return;
940
941   if (c->dispatch_width == 16) {
942      fail("Pull constants not supported in 16-wide\n");
943      return;
944   }
945
946   /* Just demote the end of the list.  We could probably do better
947    * here, demoting things that are rarely used in the program first.
948    */
949   int pull_uniform_base = max_uniform_components;
950   int pull_uniform_count = c->prog_data.nr_params - pull_uniform_base;
951
952   foreach_list(node, &this->instructions) {
953      fs_inst *inst = (fs_inst *)node;
954
955      for (int i = 0; i < 3; i++) {
956	 if (inst->src[i].file != UNIFORM)
957	    continue;
958
959	 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
960	 if (uniform_nr < pull_uniform_base)
961	    continue;
962
963	 fs_reg dst = fs_reg(this, glsl_type::float_type);
964	 fs_inst *pull = new(mem_ctx) fs_inst(FS_OPCODE_PULL_CONSTANT_LOAD,
965					      dst);
966	 pull->offset = ((uniform_nr - pull_uniform_base) * 4) & ~15;
967	 pull->ir = inst->ir;
968	 pull->annotation = inst->annotation;
969	 pull->base_mrf = 14;
970	 pull->mlen = 1;
971
972	 inst->insert_before(pull);
973
974	 inst->src[i].file = GRF;
975	 inst->src[i].reg = dst.reg;
976	 inst->src[i].reg_offset = 0;
977	 inst->src[i].smear = (uniform_nr - pull_uniform_base) & 3;
978      }
979   }
980
981   for (int i = 0; i < pull_uniform_count; i++) {
982      c->prog_data.pull_param[i] = c->prog_data.param[pull_uniform_base + i];
983      c->prog_data.pull_param_convert[i] =
984	 c->prog_data.param_convert[pull_uniform_base + i];
985   }
986   c->prog_data.nr_params -= pull_uniform_count;
987   c->prog_data.nr_pull_params = pull_uniform_count;
988}
989
990/**
991 * Attempts to move immediate constants into the immediate
992 * constant slot of following instructions.
993 *
 * Immediate constants are a bit tricky -- they have to be in the last
 * operand slot, and you can't do abs/negate on them.
996 */
997
bool
fs_visitor::propagate_constants()
{
   /* Set as soon as any operand is replaced; also the return value. */
   bool progress = false;

   calculate_live_intervals();

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* Only an unpredicated MOV of an immediate into a GRF of the same
       * type is a candidate producer.  In 16-wide dispatch, MOVs flagged
       * force_uncompressed or force_sechalf are skipped as well.
       */
      if (inst->opcode != BRW_OPCODE_MOV ||
	  inst->predicated ||
	  inst->dst.file != GRF || inst->src[0].file != IMM ||
	  inst->dst.type != inst->src[0].type ||
	  (c->dispatch_width == 16 &&
	   (inst->force_uncompressed || inst->force_sechalf)))
	 continue;

      /* Don't bother with cases where we should have had the
       * operation on the constant folded in GLSL already.
       */
      if (inst->saturate)
	 continue;

      /* Found a move of a constant to a GRF.  Find anything else using the GRF
       * before it's written, and replace it with the constant if we can.
       */
      for (fs_inst *scan_inst = (fs_inst *)inst->next;
	   !scan_inst->is_tail_sentinel();
	   scan_inst = (fs_inst *)scan_inst->next) {
	 /* The forward scan never continues past a control flow
	  * instruction of one of these kinds.
	  */
	 if (scan_inst->opcode == BRW_OPCODE_DO ||
	     scan_inst->opcode == BRW_OPCODE_WHILE ||
	     scan_inst->opcode == BRW_OPCODE_ELSE ||
	     scan_inst->opcode == BRW_OPCODE_ENDIF) {
	    break;
	 }

	 /* Sources are visited from last to first, so src[1] gets the
	  * immediate before src[0] is considered for an operand swap.
	  */
	 for (int i = 2; i >= 0; i--) {
	    /* Skip sources that do not read the exact register and
	     * offset written by the MOV.
	     */
	    if (scan_inst->src[i].file != GRF ||
		scan_inst->src[i].reg != inst->dst.reg ||
		scan_inst->src[i].reg_offset != inst->dst.reg_offset)
	       continue;

	    /* Don't bother with cases where we should have had the
	     * operation on the constant folded in GLSL already.
	     */
	    if (scan_inst->src[i].negate || scan_inst->src[i].abs)
	       continue;

	    switch (scan_inst->opcode) {
	    case BRW_OPCODE_MOV:
	       scan_inst->src[i] = inst->src[0];
	       progress = true;
	       break;

	    case BRW_OPCODE_MUL:
	    case BRW_OPCODE_ADD:
	       if (i == 1) {
		  scan_inst->src[i] = inst->src[0];
		  progress = true;
	       } else if (i == 0 && scan_inst->src[1].file != IMM) {
		  /* Fit this constant in by commuting the operands.
		   * Exception: we can't do this for 32-bit integer MUL
		   * because it's asymmetric.
		   */
		  if (scan_inst->opcode == BRW_OPCODE_MUL &&
		      (scan_inst->src[1].type == BRW_REGISTER_TYPE_D ||
		       scan_inst->src[1].type == BRW_REGISTER_TYPE_UD))
		     break;
		  scan_inst->src[0] = scan_inst->src[1];
		  scan_inst->src[1] = inst->src[0];
		  progress = true;
	       }
	       break;

	    case BRW_OPCODE_CMP:
	    case BRW_OPCODE_IF:
	       if (i == 1) {
		  scan_inst->src[i] = inst->src[0];
		  progress = true;
	       } else if (i == 0 && scan_inst->src[1].file != IMM) {
		  uint32_t new_cmod;

		  /* A result of ~0u from brw_swap_cmod() is treated as
		   * "no swapped form available"; the operand is then
		   * left untouched.
		   */
		  new_cmod = brw_swap_cmod(scan_inst->conditional_mod);
		  if (new_cmod != ~0u) {
		     /* Fit this constant in by swapping the operands and
		      * flipping the test
		      */
		     scan_inst->src[0] = scan_inst->src[1];
		     scan_inst->src[1] = inst->src[0];
		     scan_inst->conditional_mod = new_cmod;
		     progress = true;
		  }
	       }
	       break;

	    case BRW_OPCODE_SEL:
	       if (i == 1) {
		  scan_inst->src[i] = inst->src[0];
		  progress = true;
	       } else if (i == 0 && scan_inst->src[1].file != IMM) {
		  scan_inst->src[0] = scan_inst->src[1];
		  scan_inst->src[1] = inst->src[0];

		  /* If this was predicated, flipping operands means
		   * we also need to flip the predicate.
		   */
		  if (scan_inst->conditional_mod == BRW_CONDITIONAL_NONE) {
		     scan_inst->predicate_inverse =
			!scan_inst->predicate_inverse;
		  }
		  progress = true;
	       }
	       break;

	    case SHADER_OPCODE_RCP:
	       /* The hardware doesn't do math on immediate values
		* (because why are you doing that, seriously?), but
		* the correct answer is to just constant fold it
		* anyway.
		*/
	       assert(i == 0);
	       /* Folding is skipped for a zero constant, leaving the RCP
		* instruction as it was.
		*/
	       if (inst->src[0].imm.f != 0.0f) {
		  scan_inst->opcode = BRW_OPCODE_MOV;
		  scan_inst->src[0] = inst->src[0];
		  scan_inst->src[0].imm.f = 1.0f / scan_inst->src[0].imm.f;
		  progress = true;
	       }
	       break;

	    default:
	       /* Any other opcode keeps reading the GRF. */
	       break;
	    }
	 }

	 /* Once scan_inst overwrites the GRF that received the constant,
	  * stop scanning.  A texture instruction writing the same register
	  * number ends the scan whatever its reg_offset is.
	  */
	 if (scan_inst->dst.file == GRF &&
	     scan_inst->dst.reg == inst->dst.reg &&
	     (scan_inst->dst.reg_offset == inst->dst.reg_offset ||
	      scan_inst->is_tex())) {
	    break;
	 }
      }
   }

   /* Operands changed, so the cached live intervals must be recomputed. */
   if (progress)
       this->live_intervals_valid = false;

   return progress;
}
1147
1148
/**
 * Performs simple algebraic simplifications on the instruction stream.
 *
 * Currently the only rewrite is turning a float MUL whose second
 * operand is the immediate 1.0 into a MOV of the first operand.
 */
1156
1157bool
1158fs_visitor::opt_algebraic()
1159{
1160   bool progress = false;
1161
1162   calculate_live_intervals();
1163
1164   foreach_list(node, &this->instructions) {
1165      fs_inst *inst = (fs_inst *)node;
1166
1167      switch (inst->opcode) {
1168      case BRW_OPCODE_MUL:
1169	 if (inst->src[1].file != IMM)
1170	    continue;
1171
1172	 /* a * 1.0 = a */
1173	 if (inst->src[1].type == BRW_REGISTER_TYPE_F &&
1174	     inst->src[1].imm.f == 1.0) {
1175	    inst->opcode = BRW_OPCODE_MOV;
1176	    inst->src[1] = reg_undef;
1177	    progress = true;
1178	    break;
1179	 }
1180
1181	 break;
1182      default:
1183	 break;
1184      }
1185   }
1186
1187   return progress;
1188}
1189
/**
 * Must be called after calculate_live_intervals() to remove unused
 * writes to registers -- register allocation will fail otherwise
 * because something deffed but not used won't be considered to
 * interfere with other regs.
 */
1196bool
1197fs_visitor::dead_code_eliminate()
1198{
1199   bool progress = false;
1200   int pc = 0;
1201
1202   calculate_live_intervals();
1203
1204   foreach_list_safe(node, &this->instructions) {
1205      fs_inst *inst = (fs_inst *)node;
1206
1207      if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
1208	 inst->remove();
1209	 progress = true;
1210      }
1211
1212      pc++;
1213   }
1214
1215   if (progress)
1216      live_intervals_valid = false;
1217
1218   return progress;
1219}
1220
/**
 * Implements a second type of register coalescing: This one checks if
 * the two regs involved in a raw move don't interfere, in which case
 * they can both be stored in the same place and the MOV removed.
 */
1226bool
1227fs_visitor::register_coalesce_2()
1228{
1229   bool progress = false;
1230
1231   calculate_live_intervals();
1232
1233   foreach_list_safe(node, &this->instructions) {
1234      fs_inst *inst = (fs_inst *)node;
1235
1236      if (inst->opcode != BRW_OPCODE_MOV ||
1237	  inst->predicated ||
1238	  inst->saturate ||
1239	  inst->src[0].file != GRF ||
1240	  inst->src[0].negate ||
1241	  inst->src[0].abs ||
1242	  inst->src[0].smear != -1 ||
1243	  inst->dst.file != GRF ||
1244	  inst->dst.type != inst->src[0].type ||
1245	  virtual_grf_sizes[inst->src[0].reg] != 1 ||
1246	  virtual_grf_interferes(inst->dst.reg, inst->src[0].reg)) {
1247	 continue;
1248      }
1249
1250      int reg_from = inst->src[0].reg;
1251      assert(inst->src[0].reg_offset == 0);
1252      int reg_to = inst->dst.reg;
1253      int reg_to_offset = inst->dst.reg_offset;
1254
1255      foreach_list_safe(node, &this->instructions) {
1256	 fs_inst *scan_inst = (fs_inst *)node;
1257
1258	 if (scan_inst->dst.file == GRF &&
1259	     scan_inst->dst.reg == reg_from) {
1260	    scan_inst->dst.reg = reg_to;
1261	    scan_inst->dst.reg_offset = reg_to_offset;
1262	 }
1263	 for (int i = 0; i < 3; i++) {
1264	    if (scan_inst->src[i].file == GRF &&
1265		scan_inst->src[i].reg == reg_from) {
1266	       scan_inst->src[i].reg = reg_to;
1267	       scan_inst->src[i].reg_offset = reg_to_offset;
1268	    }
1269	 }
1270      }
1271
1272      inst->remove();
1273      live_intervals_valid = false;
1274      progress = true;
1275      continue;
1276   }
1277
1278   return progress;
1279}
1280
bool
fs_visitor::register_coalesce()
{
   /* Removes MOVs from a GRF or UNIFORM into a GRF by making every later
    * reader of the destination read the MOV's source instead.  Returns
    * true if at least one MOV was removed.
    */
   bool progress = false;
   int if_depth = 0;
   int loop_depth = 0;

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* Make sure that we dominate the instructions we're going to
       * scan for interfering with our coalescing, or we won't have
       * scanned enough to see if anything interferes with our
       * coalescing.  We don't dominate the following instructions if
       * we're in a loop or an if block.
       */
      switch (inst->opcode) {
      case BRW_OPCODE_DO:
	 loop_depth++;
	 break;
      case BRW_OPCODE_WHILE:
	 loop_depth--;
	 break;
      case BRW_OPCODE_IF:
	 if_depth++;
	 break;
      case BRW_OPCODE_ENDIF:
	 if_depth--;
	 break;
      default:
	 break;
      }
      if (loop_depth || if_depth)
	 continue;

      /* Candidates are unpredicated, non-saturating, type-preserving
       * MOVs into a GRF whose source is a GRF or a UNIFORM.
       */
      if (inst->opcode != BRW_OPCODE_MOV ||
	  inst->predicated ||
	  inst->saturate ||
	  inst->dst.file != GRF || (inst->src[0].file != GRF &&
				    inst->src[0].file != UNIFORM)||
	  inst->dst.type != inst->src[0].type)
	 continue;

      bool has_source_modifiers = inst->src[0].abs || inst->src[0].negate;

      /* Found a move of a GRF to a GRF.  Let's see if we can coalesce
       * them: check for no writes to either one until the exit of the
       * program.
       */
      bool interfered = false;

      for (fs_inst *scan_inst = (fs_inst *)inst->next;
	   !scan_inst->is_tail_sentinel();
	   scan_inst = (fs_inst *)scan_inst->next) {
	 if (scan_inst->dst.file == GRF) {
	    /* A later write to the MOV's destination blocks coalescing.
	     * A texture instruction writing the same register number
	     * counts whatever its reg_offset is.
	     */
	    if (scan_inst->dst.reg == inst->dst.reg &&
		(scan_inst->dst.reg_offset == inst->dst.reg_offset ||
		 scan_inst->is_tex())) {
	       interfered = true;
	       break;
	    }
	    /* Likewise for a later write to the MOV's GRF source. */
	    if (inst->src[0].file == GRF &&
		scan_inst->dst.reg == inst->src[0].reg &&
		(scan_inst->dst.reg_offset == inst->src[0].reg_offset ||
		 scan_inst->is_tex())) {
	       interfered = true;
	       break;
	    }
	 }

	 /* The gen6 MATH instruction can't handle source modifiers or
	  * unusual register regions, so avoid coalescing those for
	  * now.  We should do something more specific.
	  */
	 if (intel->gen >= 6 &&
	     scan_inst->is_math() &&
	     (has_source_modifiers || inst->src[0].file == UNIFORM)) {
	    interfered = true;
	    break;
	 }

	 /* The accumulator result appears to get used for the
	  * conditional modifier generation.  When negating a UD
	  * value, there is a 33rd bit generated for the sign in the
	  * accumulator value, so now you can't check, for example,
	  * equality with a 32-bit value.  See piglit fs-op-neg-uint.
	  */
	 if (scan_inst->conditional_mod &&
	     inst->src[0].negate &&
	     inst->src[0].type == BRW_REGISTER_TYPE_UD) {
	    interfered = true;
	    break;
	 }
      }
      if (interfered) {
	 continue;
      }

      /* Rewrite the later usage to point at the source of the move to
       * be removed.
       */
      for (fs_inst *scan_inst = inst;
	   !scan_inst->is_tail_sentinel();
	   scan_inst = (fs_inst *)scan_inst->next) {
	 for (int i = 0; i < 3; i++) {
	    if (scan_inst->src[i].file == GRF &&
		scan_inst->src[i].reg == inst->dst.reg &&
		scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
	       /* Fold the reader's own modifiers into the replacement:
		* an abs on the reader discards the MOV source's negate,
		* and the reader's negate is then XORed on top.
		*/
	       fs_reg new_src = inst->src[0];
               if (scan_inst->src[i].abs) {
                  new_src.negate = 0;
                  new_src.abs = 1;
               }
	       new_src.negate ^= scan_inst->src[i].negate;
	       scan_inst->src[i] = new_src;
	    }
	 }
      }

      inst->remove();
      progress = true;
   }

   /* Instructions were removed and operands rewritten, so the cached
    * live intervals must be recomputed.
    */
   if (progress)
      live_intervals_valid = false;

   return progress;
}
1409
1410
1411bool
1412fs_visitor::compute_to_mrf()
1413{
1414   bool progress = false;
1415   int next_ip = 0;
1416
1417   calculate_live_intervals();
1418
1419   foreach_list_safe(node, &this->instructions) {
1420      fs_inst *inst = (fs_inst *)node;
1421
1422      int ip = next_ip;
1423      next_ip++;
1424
1425      if (inst->opcode != BRW_OPCODE_MOV ||
1426	  inst->predicated ||
1427	  inst->dst.file != MRF || inst->src[0].file != GRF ||
1428	  inst->dst.type != inst->src[0].type ||
1429	  inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
1430	 continue;
1431
1432      /* Work out which hardware MRF registers are written by this
1433       * instruction.
1434       */
1435      int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
1436      int mrf_high;
1437      if (inst->dst.reg & BRW_MRF_COMPR4) {
1438	 mrf_high = mrf_low + 4;
1439      } else if (c->dispatch_width == 16 &&
1440		 (!inst->force_uncompressed && !inst->force_sechalf)) {
1441	 mrf_high = mrf_low + 1;
1442      } else {
1443	 mrf_high = mrf_low;
1444      }
1445
1446      /* Can't compute-to-MRF this GRF if someone else was going to
1447       * read it later.
1448       */
1449      if (this->virtual_grf_use[inst->src[0].reg] > ip)
1450	 continue;
1451
1452      /* Found a move of a GRF to a MRF.  Let's see if we can go
1453       * rewrite the thing that made this GRF to write into the MRF.
1454       */
1455      fs_inst *scan_inst;
1456      for (scan_inst = (fs_inst *)inst->prev;
1457	   scan_inst->prev != NULL;
1458	   scan_inst = (fs_inst *)scan_inst->prev) {
1459	 if (scan_inst->dst.file == GRF &&
1460	     scan_inst->dst.reg == inst->src[0].reg) {
1461	    /* Found the last thing to write our reg we want to turn
1462	     * into a compute-to-MRF.
1463	     */
1464
1465	    if (scan_inst->is_tex()) {
1466	       /* texturing writes several continuous regs, so we can't
1467		* compute-to-mrf that.
1468		*/
1469	       break;
1470	    }
1471
1472	    /* If it's predicated, it (probably) didn't populate all
1473	     * the channels.  We might be able to rewrite everything
1474	     * that writes that reg, but it would require smarter
1475	     * tracking to delay the rewriting until complete success.
1476	     */
1477	    if (scan_inst->predicated)
1478	       break;
1479
1480	    /* If it's half of register setup and not the same half as
1481	     * our MOV we're trying to remove, bail for now.
1482	     */
1483	    if (scan_inst->force_uncompressed != inst->force_uncompressed ||
1484		scan_inst->force_sechalf != inst->force_sechalf) {
1485	       break;
1486	    }
1487
1488	    /* SEND instructions can't have MRF as a destination. */
1489	    if (scan_inst->mlen)
1490	       break;
1491
1492	    if (intel->gen >= 6) {
1493	       /* gen6 math instructions must have the destination be
1494		* GRF, so no compute-to-MRF for them.
1495		*/
1496	       if (scan_inst->is_math()) {
1497		  break;
1498	       }
1499	    }
1500
1501	    if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
1502	       /* Found the creator of our MRF's source value. */
1503	       scan_inst->dst.file = MRF;
1504	       scan_inst->dst.reg = inst->dst.reg;
1505	       scan_inst->saturate |= inst->saturate;
1506	       inst->remove();
1507	       progress = true;
1508	    }
1509	    break;
1510	 }
1511
1512	 /* We don't handle flow control here.  Most computation of
1513	  * values that end up in MRFs are shortly before the MRF
1514	  * write anyway.
1515	  */
1516	 if (scan_inst->opcode == BRW_OPCODE_DO ||
1517	     scan_inst->opcode == BRW_OPCODE_WHILE ||
1518	     scan_inst->opcode == BRW_OPCODE_ELSE ||
1519	     scan_inst->opcode == BRW_OPCODE_ENDIF) {
1520	    break;
1521	 }
1522
1523	 /* You can't read from an MRF, so if someone else reads our
1524	  * MRF's source GRF that we wanted to rewrite, that stops us.
1525	  */
1526	 bool interfered = false;
1527	 for (int i = 0; i < 3; i++) {
1528	    if (scan_inst->src[i].file == GRF &&
1529		scan_inst->src[i].reg == inst->src[0].reg &&
1530		scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
1531	       interfered = true;
1532	    }
1533	 }
1534	 if (interfered)
1535	    break;
1536
1537	 if (scan_inst->dst.file == MRF) {
1538	    /* If somebody else writes our MRF here, we can't
1539	     * compute-to-MRF before that.
1540	     */
1541	    int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
1542	    int scan_mrf_high;
1543
1544	    if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
1545	       scan_mrf_high = scan_mrf_low + 4;
1546	    } else if (c->dispatch_width == 16 &&
1547		       (!scan_inst->force_uncompressed &&
1548			!scan_inst->force_sechalf)) {
1549	       scan_mrf_high = scan_mrf_low + 1;
1550	    } else {
1551	       scan_mrf_high = scan_mrf_low;
1552	    }
1553
1554	    if (mrf_low == scan_mrf_low ||
1555		mrf_low == scan_mrf_high ||
1556		mrf_high == scan_mrf_low ||
1557		mrf_high == scan_mrf_high) {
1558	       break;
1559	    }
1560	 }
1561
1562	 if (scan_inst->mlen > 0) {
1563	    /* Found a SEND instruction, which means that there are
1564	     * live values in MRFs from base_mrf to base_mrf +
1565	     * scan_inst->mlen - 1.  Don't go pushing our MRF write up
1566	     * above it.
1567	     */
1568	    if (mrf_low >= scan_inst->base_mrf &&
1569		mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
1570	       break;
1571	    }
1572	    if (mrf_high >= scan_inst->base_mrf &&
1573		mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
1574	       break;
1575	    }
1576	 }
1577      }
1578   }
1579
1580   return progress;
1581}
1582
1583/**
1584 * Walks through basic blocks, looking for repeated MRF writes and
1585 * removing the later ones.
1586 */
1587bool
1588fs_visitor::remove_duplicate_mrf_writes()
1589{
1590   fs_inst *last_mrf_move[16];
1591   bool progress = false;
1592
1593   /* Need to update the MRF tracking for compressed instructions. */
1594   if (c->dispatch_width == 16)
1595      return false;
1596
1597   memset(last_mrf_move, 0, sizeof(last_mrf_move));
1598
1599   foreach_list_safe(node, &this->instructions) {
1600      fs_inst *inst = (fs_inst *)node;
1601
1602      switch (inst->opcode) {
1603      case BRW_OPCODE_DO:
1604      case BRW_OPCODE_WHILE:
1605      case BRW_OPCODE_IF:
1606      case BRW_OPCODE_ELSE:
1607      case BRW_OPCODE_ENDIF:
1608	 memset(last_mrf_move, 0, sizeof(last_mrf_move));
1609	 continue;
1610      default:
1611	 break;
1612      }
1613
1614      if (inst->opcode == BRW_OPCODE_MOV &&
1615	  inst->dst.file == MRF) {
1616	 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
1617	 if (prev_inst && inst->equals(prev_inst)) {
1618	    inst->remove();
1619	    progress = true;
1620	    continue;
1621	 }
1622      }
1623
1624      /* Clear out the last-write records for MRFs that were overwritten. */
1625      if (inst->dst.file == MRF) {
1626	 last_mrf_move[inst->dst.reg] = NULL;
1627      }
1628
1629      if (inst->mlen > 0) {
1630	 /* Found a SEND instruction, which will include two or fewer
1631	  * implied MRF writes.  We could do better here.
1632	  */
1633	 for (int i = 0; i < implied_mrf_writes(inst); i++) {
1634	    last_mrf_move[inst->base_mrf + i] = NULL;
1635	 }
1636      }
1637
1638      /* Clear out any MRF move records whose sources got overwritten. */
1639      if (inst->dst.file == GRF) {
1640	 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
1641	    if (last_mrf_move[i] &&
1642		last_mrf_move[i]->src[0].reg == inst->dst.reg) {
1643	       last_mrf_move[i] = NULL;
1644	    }
1645	 }
1646      }
1647
1648      if (inst->opcode == BRW_OPCODE_MOV &&
1649	  inst->dst.file == MRF &&
1650	  inst->src[0].file == GRF &&
1651	  !inst->predicated) {
1652	 last_mrf_move[inst->dst.reg] = inst;
1653      }
1654   }
1655
1656   return progress;
1657}
1658
1659/**
1660 * Possibly returns an instruction that set up @param reg.
1661 *
1662 * Sometimes we want to take the result of some expression/variable
1663 * dereference tree and rewrite the instruction generating the result
1664 * of the tree.  When processing the tree, we know that the
1665 * instructions generated are all writing temporaries that are dead
1666 * outside of this tree.  So, if we have some instructions that write
1667 * a temporary, we're free to point that temp write somewhere else.
1668 *
1669 * Note that this doesn't guarantee that the instruction generated
1670 * only reg -- it might be the size=4 destination of a texture instruction.
1671 */
1672fs_inst *
1673fs_visitor::get_instruction_generating_reg(fs_inst *start,
1674					   fs_inst *end,
1675					   fs_reg reg)
1676{
1677   if (end == start ||
1678       end->predicated ||
1679       end->force_uncompressed ||
1680       end->force_sechalf ||
1681       !reg.equals(end->dst)) {
1682      return NULL;
1683   } else {
1684      return end;
1685   }
1686}
1687
bool
fs_visitor::run()
{
   /* Drives one compile at c->dispatch_width: builds the FS IR, runs the
    * optimization passes to a fixed point, allocates registers and
    * generates code.  Returns false if `failed` is set along the way.
    */
   uint32_t prog_offset_16 = 0;
   /* Remembered only for the 16-wide sanity assert at the end. */
   uint32_t orig_nr_params = c->prog_data.nr_params;

   brw_wm_payload_setup(brw, c);

   if (c->dispatch_width == 16) {
      /* align to 64 byte boundary. */
      while ((c->func.nr_insn * sizeof(struct brw_instruction)) % 64) {
	 brw_NOP(p);
      }

      /* Save off the start of this 16-wide program in case we succeed. */
      prog_offset_16 = c->func.nr_insn * sizeof(struct brw_instruction);

      brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
   }

   /* Compile-time switch: change to 1 to emit the dummy shader instead. */
   if (0) {
      emit_dummy_fs();
   } else {
      calculate_urb_setup();
      if (intel->gen < 6)
	 emit_interpolation_setup_gen4();
      else
	 emit_interpolation_setup_gen6();

      /* Generate FS IR for main().  (the visitor only descends into
       * functions called "main").
       */
      foreach_list(node, &*shader->ir) {
	 ir_instruction *ir = (ir_instruction *)node;
	 base_ir = ir;
	 this->result = reg_undef;
	 ir->accept(this);
      }
      if (failed)
	 return false;

      emit_fb_writes();

      split_virtual_grfs();

      setup_paramvalues_refs();
      setup_pull_constants();

      /* Repeat the whole pass sequence until a full round makes no
       * change.  Every pass is called on every round; "|| progress"
       * comes second so the call is never short-circuited away.
       */
      bool progress;
      do {
	 progress = false;

	 progress = remove_duplicate_mrf_writes() || progress;

	 progress = propagate_constants() || progress;
	 progress = opt_algebraic() || progress;
	 progress = opt_cse() || progress;
	 progress = opt_copy_propagate() || progress;
	 progress = register_coalesce() || progress;
	 progress = register_coalesce_2() || progress;
	 progress = compute_to_mrf() || progress;
	 progress = dead_code_eliminate() || progress;
      } while (progress);

      remove_dead_constants();

      schedule_instructions();

      assign_curb_setup();
      assign_urb_setup();

      if (0) {
	 /* Debug of register spilling: Go spill everything. */
	 int virtual_grf_count = virtual_grf_next;
	 for (int i = 0; i < virtual_grf_count; i++) {
	    spill_reg(i);
	 }
      }

      /* Compile-time switch between trivial and real allocation. */
      if (0)
	 assign_regs_trivial();
      else {
	 /* Retry assign_regs() until it returns true or the compile
	  * has been flagged as failed.
	  */
	 while (!assign_regs()) {
	    if (failed)
	       break;
	 }
      }
   }
   assert(force_uncompressed_stack == 0);
   assert(force_sechalf_stack == 0);

   if (failed)
      return false;

   generate_code();

   /* Record the register block count in the field that matches the
    * dispatch width that was just compiled.
    */
   if (c->dispatch_width == 8) {
      c->prog_data.reg_blocks = brw_register_blocks(grf_used);
   } else {
      c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
      c->prog_data.prog_offset_16 = prog_offset_16;

      /* Make sure we didn't try to sneak in an extra uniform */
      assert(orig_nr_params == c->prog_data.nr_params);
      /* Keeps the variable referenced when the assert is compiled out. */
      (void) orig_nr_params;
   }

   return !failed;
}
1797
1798bool
1799brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
1800	       struct gl_shader_program *prog)
1801{
1802   struct intel_context *intel = &brw->intel;
1803
1804   if (!prog)
1805      return false;
1806
1807   struct brw_shader *shader =
1808     (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
1809   if (!shader)
1810      return false;
1811
1812   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
1813      printf("GLSL IR for native fragment shader %d:\n", prog->Name);
1814      _mesa_print_ir(shader->ir, NULL);
1815      printf("\n\n");
1816   }
1817
1818   /* Now the main event: Visit the shader IR and generate our FS IR for it.
1819    */
1820   c->dispatch_width = 8;
1821
1822   fs_visitor v(c, prog, shader);
1823   if (!v.run()) {
1824      prog->LinkStatus = false;
1825      ralloc_strcat(&prog->InfoLog, v.fail_msg);
1826
1827      _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
1828		    v.fail_msg);
1829
1830      return false;
1831   }
1832
1833   if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0) {
1834      c->dispatch_width = 16;
1835      fs_visitor v2(c, prog, shader);
1836      v2.import_uniforms(&v);
1837      v2.run();
1838   }
1839
1840   c->prog_data.dispatch_width = 8;
1841
1842   return true;
1843}
1844
1845bool
1846brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
1847{
1848   struct brw_context *brw = brw_context(ctx);
1849   struct brw_wm_prog_key key;
1850
1851   /* As a temporary measure we assume that all programs use dFdy() (and hence
1852    * need to be compiled differently depending on whether we're rendering to
1853    * an FBO).  FIXME: set this bool correctly based on the contents of the
1854    * program.
1855    */
1856   bool program_uses_dfdy = true;
1857
1858   if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
1859      return true;
1860
1861   struct gl_fragment_program *fp = (struct gl_fragment_program *)
1862      prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
1863   struct brw_fragment_program *bfp = brw_fragment_program(fp);
1864
1865   memset(&key, 0, sizeof(key));
1866
1867   if (fp->UsesKill)
1868      key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
1869
1870   if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
1871      key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
1872
1873   /* Just assume depth testing. */
1874   key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
1875   key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
1876
1877   key.vp_outputs_written |= BITFIELD64_BIT(FRAG_ATTRIB_WPOS);
1878   for (int i = 0; i < FRAG_ATTRIB_MAX; i++) {
1879      if (!(fp->Base.InputsRead & BITFIELD64_BIT(i)))
1880	 continue;
1881
1882      key.proj_attrib_mask |= 1 << i;
1883
1884      int vp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i);
1885
1886      if (vp_index >= 0)
1887	 key.vp_outputs_written |= BITFIELD64_BIT(vp_index);
1888   }
1889
1890   key.clamp_fragment_color = true;
1891
1892   for (int i = 0; i < BRW_MAX_TEX_UNIT; i++) {
1893      if (fp->Base.ShadowSamplers & (1 << i))
1894	 key.tex.compare_funcs[i] = GL_LESS;
1895
1896      /* FINISHME: depth compares might use (0,0,0,W) for example */
1897      key.tex.swizzles[i] = SWIZZLE_XYZW;
1898   }
1899
1900   if (fp->Base.InputsRead & FRAG_BIT_WPOS) {
1901      key.drawable_height = ctx->DrawBuffer->Height;
1902   }
1903
1904   if ((fp->Base.InputsRead & FRAG_BIT_WPOS) || program_uses_dfdy) {
1905      key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
1906   }
1907
1908   key.nr_color_regions = 1;
1909
1910   key.program_string_id = bfp->id;
1911
1912   uint32_t old_prog_offset = brw->wm.prog_offset;
1913   struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
1914
1915   bool success = do_wm_prog(brw, prog, bfp, &key);
1916
1917   brw->wm.prog_offset = old_prog_offset;
1918   brw->wm.prog_data = old_prog_data;
1919
1920   return success;
1921}
1922