brw_vec4.cpp revision 0163c99e8f6959b5d6c7a937a322127cfdf9315f
1/*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24#include "brw_vec4.h"
25#include "brw_fs.h"
26#include "brw_cfg.h"
27#include "brw_vs.h"
28#include "brw_nir.h"
29#include "brw_vec4_live_variables.h"
30#include "brw_dead_control_flow.h"
31
32extern "C" {
33#include "main/macros.h"
34#include "main/shaderobj.h"
35#include "program/prog_print.h"
36#include "program/prog_parameter.h"
37}
38#include "main/context.h"
39
40#define MAX_INSTRUCTION (1 << 30)
41
42using namespace brw;
43
44namespace brw {
45
46void
47src_reg::init()
48{
49   memset(this, 0, sizeof(*this));
50
51   this->file = BAD_FILE;
52}
53
54src_reg::src_reg(register_file file, int reg, const glsl_type *type)
55{
56   init();
57
58   this->file = file;
59   this->reg = reg;
60   if (type && (type->is_scalar() || type->is_vector() || type->is_matrix()))
61      this->swizzle = brw_swizzle_for_size(type->vector_elements);
62   else
63      this->swizzle = BRW_SWIZZLE_XYZW;
64}
65
66/** Generic unset register constructor. */
67src_reg::src_reg()
68{
69   init();
70}
71
72src_reg::src_reg(float f)
73{
74   init();
75
76   this->file = IMM;
77   this->type = BRW_REGISTER_TYPE_F;
78   this->fixed_hw_reg.dw1.f = f;
79}
80
81src_reg::src_reg(uint32_t u)
82{
83   init();
84
85   this->file = IMM;
86   this->type = BRW_REGISTER_TYPE_UD;
87   this->fixed_hw_reg.dw1.ud = u;
88}
89
90src_reg::src_reg(int32_t i)
91{
92   init();
93
94   this->file = IMM;
95   this->type = BRW_REGISTER_TYPE_D;
96   this->fixed_hw_reg.dw1.d = i;
97}
98
99src_reg::src_reg(uint8_t vf[4])
100{
101   init();
102
103   this->file = IMM;
104   this->type = BRW_REGISTER_TYPE_VF;
105   memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
106}
107
108src_reg::src_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
109{
110   init();
111
112   this->file = IMM;
113   this->type = BRW_REGISTER_TYPE_VF;
114   this->fixed_hw_reg.dw1.ud = (vf0 <<  0) |
115                               (vf1 <<  8) |
116                               (vf2 << 16) |
117                               (vf3 << 24);
118}
119
120src_reg::src_reg(struct brw_reg reg)
121{
122   init();
123
124   this->file = HW_REG;
125   this->fixed_hw_reg = reg;
126   this->type = reg.type;
127}
128
129src_reg::src_reg(const dst_reg &reg)
130{
131   init();
132
133   this->file = reg.file;
134   this->reg = reg.reg;
135   this->reg_offset = reg.reg_offset;
136   this->type = reg.type;
137   this->reladdr = reg.reladdr;
138   this->fixed_hw_reg = reg.fixed_hw_reg;
139   this->swizzle = brw_swizzle_for_mask(reg.writemask);
140}
141
142void
143dst_reg::init()
144{
145   memset(this, 0, sizeof(*this));
146   this->file = BAD_FILE;
147   this->writemask = WRITEMASK_XYZW;
148}
149
150dst_reg::dst_reg()
151{
152   init();
153}
154
155dst_reg::dst_reg(register_file file, int reg)
156{
157   init();
158
159   this->file = file;
160   this->reg = reg;
161}
162
163dst_reg::dst_reg(register_file file, int reg, const glsl_type *type,
164                 unsigned writemask)
165{
166   init();
167
168   this->file = file;
169   this->reg = reg;
170   this->type = brw_type_for_base_type(type);
171   this->writemask = writemask;
172}
173
174dst_reg::dst_reg(struct brw_reg reg)
175{
176   init();
177
178   this->file = HW_REG;
179   this->fixed_hw_reg = reg;
180   this->type = reg.type;
181}
182
183dst_reg::dst_reg(const src_reg &reg)
184{
185   init();
186
187   this->file = reg.file;
188   this->reg = reg.reg;
189   this->reg_offset = reg.reg_offset;
190   this->type = reg.type;
191   this->writemask = brw_mask_for_swizzle(reg.swizzle);
192   this->reladdr = reg.reladdr;
193   this->fixed_hw_reg = reg.fixed_hw_reg;
194}
195
196bool
197dst_reg::equals(const dst_reg &r) const
198{
199   return (file == r.file &&
200           reg == r.reg &&
201           reg_offset == r.reg_offset &&
202           type == r.type &&
203           negate == r.negate &&
204           abs == r.abs &&
205           writemask == r.writemask &&
206           (reladdr == r.reladdr ||
207            (reladdr && r.reladdr && reladdr->equals(*r.reladdr))) &&
208           memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
209                  sizeof(fixed_hw_reg)) == 0);
210}
211
212bool
213vec4_instruction::is_send_from_grf()
214{
215   switch (opcode) {
216   case SHADER_OPCODE_SHADER_TIME_ADD:
217   case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
218   case SHADER_OPCODE_UNTYPED_ATOMIC:
219   case SHADER_OPCODE_UNTYPED_SURFACE_READ:
220   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
221   case SHADER_OPCODE_TYPED_ATOMIC:
222   case SHADER_OPCODE_TYPED_SURFACE_READ:
223   case SHADER_OPCODE_TYPED_SURFACE_WRITE:
224      return true;
225   default:
226      return false;
227   }
228}
229
230unsigned
231vec4_instruction::regs_read(unsigned arg) const
232{
233   if (src[arg].file == BAD_FILE)
234      return 0;
235
236   switch (opcode) {
237   case SHADER_OPCODE_SHADER_TIME_ADD:
238   case SHADER_OPCODE_UNTYPED_ATOMIC:
239   case SHADER_OPCODE_UNTYPED_SURFACE_READ:
240   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
241   case SHADER_OPCODE_TYPED_ATOMIC:
242   case SHADER_OPCODE_TYPED_SURFACE_READ:
243   case SHADER_OPCODE_TYPED_SURFACE_WRITE:
244      return arg == 0 ? mlen : 1;
245
246   case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
247      return arg == 1 ? mlen : 1;
248
249   default:
250      return 1;
251   }
252}
253
254bool
255vec4_instruction::can_do_source_mods(const struct brw_device_info *devinfo)
256{
257   if (devinfo->gen == 6 && is_math())
258      return false;
259
260   if (is_send_from_grf())
261      return false;
262
263   if (!backend_instruction::can_do_source_mods())
264      return false;
265
266   return true;
267}
268
269/**
270 * Returns how many MRFs an opcode will write over.
271 *
272 * Note that this is not the 0 or 1 implied writes in an actual gen
273 * instruction -- the generate_* functions generate additional MOVs
274 * for setup.
275 */
276int
277vec4_visitor::implied_mrf_writes(vec4_instruction *inst)
278{
279   if (inst->mlen == 0 || inst->is_send_from_grf())
280      return 0;
281
282   switch (inst->opcode) {
283   case SHADER_OPCODE_RCP:
284   case SHADER_OPCODE_RSQ:
285   case SHADER_OPCODE_SQRT:
286   case SHADER_OPCODE_EXP2:
287   case SHADER_OPCODE_LOG2:
288   case SHADER_OPCODE_SIN:
289   case SHADER_OPCODE_COS:
290      return 1;
291   case SHADER_OPCODE_INT_QUOTIENT:
292   case SHADER_OPCODE_INT_REMAINDER:
293   case SHADER_OPCODE_POW:
294      return 2;
295   case VS_OPCODE_URB_WRITE:
296      return 1;
297   case VS_OPCODE_PULL_CONSTANT_LOAD:
298      return 2;
299   case SHADER_OPCODE_GEN4_SCRATCH_READ:
300      return 2;
301   case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
302      return 3;
303   case GS_OPCODE_URB_WRITE:
304   case GS_OPCODE_URB_WRITE_ALLOCATE:
305   case GS_OPCODE_THREAD_END:
306      return 0;
307   case GS_OPCODE_FF_SYNC:
308      return 1;
309   case SHADER_OPCODE_SHADER_TIME_ADD:
310      return 0;
311   case SHADER_OPCODE_TEX:
312   case SHADER_OPCODE_TXL:
313   case SHADER_OPCODE_TXD:
314   case SHADER_OPCODE_TXF:
315   case SHADER_OPCODE_TXF_CMS:
316   case SHADER_OPCODE_TXF_MCS:
317   case SHADER_OPCODE_TXS:
318   case SHADER_OPCODE_TG4:
319   case SHADER_OPCODE_TG4_OFFSET:
320      return inst->header_size;
321   default:
322      unreachable("not reached");
323   }
324}
325
326bool
327src_reg::equals(const src_reg &r) const
328{
329   return (file == r.file &&
330	   reg == r.reg &&
331	   reg_offset == r.reg_offset &&
332	   type == r.type &&
333	   negate == r.negate &&
334	   abs == r.abs &&
335	   swizzle == r.swizzle &&
336	   !reladdr && !r.reladdr &&
337	   memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
338		  sizeof(fixed_hw_reg)) == 0);
339}
340
341bool
342vec4_visitor::opt_vector_float()
343{
344   bool progress = false;
345
346   int last_reg = -1, last_reg_offset = -1;
347   enum register_file last_reg_file = BAD_FILE;
348
349   int remaining_channels = 0;
350   uint8_t imm[4];
351   int inst_count = 0;
352   vec4_instruction *imm_inst[4];
353
354   foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
355      if (last_reg != inst->dst.reg ||
356          last_reg_offset != inst->dst.reg_offset ||
357          last_reg_file != inst->dst.file) {
358         last_reg = inst->dst.reg;
359         last_reg_offset = inst->dst.reg_offset;
360         last_reg_file = inst->dst.file;
361         remaining_channels = WRITEMASK_XYZW;
362
363         inst_count = 0;
364      }
365
366      if (inst->opcode != BRW_OPCODE_MOV ||
367          inst->dst.writemask == WRITEMASK_XYZW ||
368          inst->src[0].file != IMM)
369         continue;
370
371      int vf = brw_float_to_vf(inst->src[0].fixed_hw_reg.dw1.f);
372      if (vf == -1)
373         continue;
374
375      if ((inst->dst.writemask & WRITEMASK_X) != 0)
376         imm[0] = vf;
377      if ((inst->dst.writemask & WRITEMASK_Y) != 0)
378         imm[1] = vf;
379      if ((inst->dst.writemask & WRITEMASK_Z) != 0)
380         imm[2] = vf;
381      if ((inst->dst.writemask & WRITEMASK_W) != 0)
382         imm[3] = vf;
383
384      imm_inst[inst_count++] = inst;
385
386      remaining_channels &= ~inst->dst.writemask;
387      if (remaining_channels == 0) {
388         vec4_instruction *mov = MOV(inst->dst, imm);
389         mov->dst.type = BRW_REGISTER_TYPE_F;
390         mov->dst.writemask = WRITEMASK_XYZW;
391         inst->insert_after(block, mov);
392         last_reg = -1;
393
394         for (int i = 0; i < inst_count; i++) {
395            imm_inst[i]->remove(block);
396         }
397         progress = true;
398      }
399   }
400
401   if (progress)
402      invalidate_live_intervals();
403
404   return progress;
405}
406
407/* Replaces unused channels of a swizzle with channels that are used.
408 *
409 * For instance, this pass transforms
410 *
411 *    mov vgrf4.yz, vgrf5.wxzy
412 *
413 * into
414 *
415 *    mov vgrf4.yz, vgrf5.xxzx
416 *
417 * This eliminates false uses of some channels, letting dead code elimination
418 * remove the instructions that wrote them.
419 */
420bool
421vec4_visitor::opt_reduce_swizzle()
422{
423   bool progress = false;
424
425   foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
426      if (inst->dst.file == BAD_FILE || inst->dst.file == HW_REG ||
427          inst->is_send_from_grf())
428         continue;
429
430      unsigned swizzle;
431
432      /* Determine which channels of the sources are read. */
433      switch (inst->opcode) {
434      case VEC4_OPCODE_PACK_BYTES:
435      case BRW_OPCODE_DP4:
436      case BRW_OPCODE_DPH: /* FINISHME: DPH reads only three channels of src0,
437                            *           but all four of src1.
438                            */
439         swizzle = brw_swizzle_for_size(4);
440         break;
441      case BRW_OPCODE_DP3:
442         swizzle = brw_swizzle_for_size(3);
443         break;
444      case BRW_OPCODE_DP2:
445         swizzle = brw_swizzle_for_size(2);
446         break;
447      default:
448         swizzle = brw_swizzle_for_mask(inst->dst.writemask);
449         break;
450      }
451
452      /* Update sources' swizzles. */
453      for (int i = 0; i < 3; i++) {
454         if (inst->src[i].file != GRF &&
455             inst->src[i].file != ATTR &&
456             inst->src[i].file != UNIFORM)
457            continue;
458
459         const unsigned new_swizzle =
460            brw_compose_swizzle(swizzle, inst->src[i].swizzle);
461         if (inst->src[i].swizzle != new_swizzle) {
462            inst->src[i].swizzle = new_swizzle;
463            progress = true;
464         }
465      }
466   }
467
468   if (progress)
469      invalidate_live_intervals();
470
471   return progress;
472}
473
474void
475vec4_visitor::split_uniform_registers()
476{
477   /* Prior to this, uniforms have been in an array sized according to
478    * the number of vector uniforms present, sparsely filled (so an
479    * aggregate results in reg indices being skipped over).  Now we're
480    * going to cut those aggregates up so each .reg index is one
481    * vector.  The goal is to make elimination of unused uniform
482    * components easier later.
483    */
484   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
485      for (int i = 0 ; i < 3; i++) {
486	 if (inst->src[i].file != UNIFORM)
487	    continue;
488
489	 assert(!inst->src[i].reladdr);
490
491	 inst->src[i].reg += inst->src[i].reg_offset;
492	 inst->src[i].reg_offset = 0;
493      }
494   }
495
496   /* Update that everything is now vector-sized. */
497   for (int i = 0; i < this->uniforms; i++) {
498      this->uniform_size[i] = 1;
499   }
500}
501
502void
503vec4_visitor::pack_uniform_registers()
504{
505   bool uniform_used[this->uniforms];
506   int new_loc[this->uniforms];
507   int new_chan[this->uniforms];
508
509   memset(uniform_used, 0, sizeof(uniform_used));
510   memset(new_loc, 0, sizeof(new_loc));
511   memset(new_chan, 0, sizeof(new_chan));
512
513   /* Find which uniform vectors are actually used by the program.  We
514    * expect unused vector elements when we've moved array access out
515    * to pull constants, and from some GLSL code generators like wine.
516    */
517   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
518      for (int i = 0 ; i < 3; i++) {
519	 if (inst->src[i].file != UNIFORM)
520	    continue;
521
522	 uniform_used[inst->src[i].reg] = true;
523      }
524   }
525
526   int new_uniform_count = 0;
527
528   /* Now, figure out a packing of the live uniform vectors into our
529    * push constants.
530    */
531   for (int src = 0; src < uniforms; src++) {
532      assert(src < uniform_array_size);
533      int size = this->uniform_vector_size[src];
534
535      if (!uniform_used[src]) {
536	 this->uniform_vector_size[src] = 0;
537	 continue;
538      }
539
540      int dst;
541      /* Find the lowest place we can slot this uniform in. */
542      for (dst = 0; dst < src; dst++) {
543	 if (this->uniform_vector_size[dst] + size <= 4)
544	    break;
545      }
546
547      if (src == dst) {
548	 new_loc[src] = dst;
549	 new_chan[src] = 0;
550      } else {
551	 new_loc[src] = dst;
552	 new_chan[src] = this->uniform_vector_size[dst];
553
554	 /* Move the references to the data */
555	 for (int j = 0; j < size; j++) {
556	    stage_prog_data->param[dst * 4 + new_chan[src] + j] =
557	       stage_prog_data->param[src * 4 + j];
558	 }
559
560	 this->uniform_vector_size[dst] += size;
561	 this->uniform_vector_size[src] = 0;
562      }
563
564      new_uniform_count = MAX2(new_uniform_count, dst + 1);
565   }
566
567   this->uniforms = new_uniform_count;
568
569   /* Now, update the instructions for our repacked uniforms. */
570   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
571      for (int i = 0 ; i < 3; i++) {
572	 int src = inst->src[i].reg;
573
574	 if (inst->src[i].file != UNIFORM)
575	    continue;
576
577	 inst->src[i].reg = new_loc[src];
578         inst->src[i].swizzle += BRW_SWIZZLE4(new_chan[src], new_chan[src],
579                                              new_chan[src], new_chan[src]);
580      }
581   }
582}
583
584/**
585 * Does algebraic optimizations (0 * a = 0, 1 * a = a, a + 0 = a).
586 *
587 * While GLSL IR also performs this optimization, we end up with it in
588 * our instruction stream for a couple of reasons.  One is that we
589 * sometimes generate silly instructions, for example in array access
590 * where we'll generate "ADD offset, index, base" even if base is 0.
591 * The other is that GLSL IR's constant propagation doesn't track the
592 * components of aggregates, so some VS patterns (initialize matrix to
593 * 0, accumulate in vertex blending factors) end up breaking down to
594 * instructions involving 0.
595 */
596bool
597vec4_visitor::opt_algebraic()
598{
599   bool progress = false;
600
601   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
602      switch (inst->opcode) {
603      case BRW_OPCODE_MOV:
604         if (inst->src[0].file != IMM)
605            break;
606
607         if (inst->saturate) {
608            if (inst->dst.type != inst->src[0].type)
609               assert(!"unimplemented: saturate mixed types");
610
611            if (brw_saturate_immediate(inst->dst.type,
612                                       &inst->src[0].fixed_hw_reg)) {
613               inst->saturate = false;
614               progress = true;
615            }
616         }
617         break;
618
619      case VEC4_OPCODE_UNPACK_UNIFORM:
620         if (inst->src[0].file != UNIFORM) {
621            inst->opcode = BRW_OPCODE_MOV;
622            progress = true;
623         }
624         break;
625
626      case BRW_OPCODE_ADD:
627	 if (inst->src[1].is_zero()) {
628	    inst->opcode = BRW_OPCODE_MOV;
629	    inst->src[1] = src_reg();
630	    progress = true;
631	 }
632	 break;
633
634      case BRW_OPCODE_MUL:
635	 if (inst->src[1].is_zero()) {
636	    inst->opcode = BRW_OPCODE_MOV;
637	    switch (inst->src[0].type) {
638	    case BRW_REGISTER_TYPE_F:
639	       inst->src[0] = src_reg(0.0f);
640	       break;
641	    case BRW_REGISTER_TYPE_D:
642	       inst->src[0] = src_reg(0);
643	       break;
644	    case BRW_REGISTER_TYPE_UD:
645	       inst->src[0] = src_reg(0u);
646	       break;
647	    default:
648	       unreachable("not reached");
649	    }
650	    inst->src[1] = src_reg();
651	    progress = true;
652	 } else if (inst->src[1].is_one()) {
653	    inst->opcode = BRW_OPCODE_MOV;
654	    inst->src[1] = src_reg();
655	    progress = true;
656         } else if (inst->src[1].is_negative_one()) {
657            inst->opcode = BRW_OPCODE_MOV;
658            inst->src[0].negate = !inst->src[0].negate;
659            inst->src[1] = src_reg();
660            progress = true;
661	 }
662	 break;
663      case BRW_OPCODE_CMP:
664         if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
665             inst->src[0].abs &&
666             inst->src[0].negate &&
667             inst->src[1].is_zero()) {
668            inst->src[0].abs = false;
669            inst->src[0].negate = false;
670            inst->conditional_mod = BRW_CONDITIONAL_Z;
671            progress = true;
672            break;
673         }
674         break;
675      case SHADER_OPCODE_RCP: {
676         vec4_instruction *prev = (vec4_instruction *)inst->prev;
677         if (prev->opcode == SHADER_OPCODE_SQRT) {
678            if (inst->src[0].equals(src_reg(prev->dst))) {
679               inst->opcode = SHADER_OPCODE_RSQ;
680               inst->src[0] = prev->src[0];
681               progress = true;
682            }
683         }
684         break;
685      }
686      case SHADER_OPCODE_BROADCAST:
687         if (is_uniform(inst->src[0]) ||
688             inst->src[1].is_zero()) {
689            inst->opcode = BRW_OPCODE_MOV;
690            inst->src[1] = src_reg();
691            inst->force_writemask_all = true;
692            progress = true;
693         }
694         break;
695
696      default:
697	 break;
698      }
699   }
700
701   if (progress)
702      invalidate_live_intervals();
703
704   return progress;
705}
706
707/**
708 * Only a limited number of hardware registers may be used for push
709 * constants, so this turns access to the overflowed constants into
710 * pull constants.
711 */
712void
713vec4_visitor::move_push_constants_to_pull_constants()
714{
715   int pull_constant_loc[this->uniforms];
716
717   /* Only allow 32 registers (256 uniform components) as push constants,
718    * which is the limit on gen6.
719    *
720    * If changing this value, note the limitation about total_regs in
721    * brw_curbe.c.
722    */
723   int max_uniform_components = 32 * 8;
724   if (this->uniforms * 4 <= max_uniform_components)
725      return;
726
727   /* Make some sort of choice as to which uniforms get sent to pull
728    * constants.  We could potentially do something clever here like
729    * look for the most infrequently used uniform vec4s, but leave
730    * that for later.
731    */
732   for (int i = 0; i < this->uniforms * 4; i += 4) {
733      pull_constant_loc[i / 4] = -1;
734
735      if (i >= max_uniform_components) {
736	 const gl_constant_value **values = &stage_prog_data->param[i];
737
738	 /* Try to find an existing copy of this uniform in the pull
739	  * constants if it was part of an array access already.
740	  */
741	 for (unsigned int j = 0; j < stage_prog_data->nr_pull_params; j += 4) {
742	    int matches;
743
744	    for (matches = 0; matches < 4; matches++) {
745	       if (stage_prog_data->pull_param[j + matches] != values[matches])
746		  break;
747	    }
748
749	    if (matches == 4) {
750	       pull_constant_loc[i / 4] = j / 4;
751	       break;
752	    }
753	 }
754
755	 if (pull_constant_loc[i / 4] == -1) {
756	    assert(stage_prog_data->nr_pull_params % 4 == 0);
757	    pull_constant_loc[i / 4] = stage_prog_data->nr_pull_params / 4;
758
759	    for (int j = 0; j < 4; j++) {
760	       stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
761                  values[j];
762	    }
763	 }
764      }
765   }
766
767   /* Now actually rewrite usage of the things we've moved to pull
768    * constants.
769    */
770   foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
771      for (int i = 0 ; i < 3; i++) {
772	 if (inst->src[i].file != UNIFORM ||
773	     pull_constant_loc[inst->src[i].reg] == -1)
774	    continue;
775
776	 int uniform = inst->src[i].reg;
777
778	 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
779
780	 emit_pull_constant_load(block, inst, temp, inst->src[i],
781				 pull_constant_loc[uniform]);
782
783	 inst->src[i].file = temp.file;
784	 inst->src[i].reg = temp.reg;
785	 inst->src[i].reg_offset = temp.reg_offset;
786	 inst->src[i].reladdr = NULL;
787      }
788   }
789
790   /* Repack push constants to remove the now-unused ones. */
791   pack_uniform_registers();
792}
793
794/* Conditions for which we want to avoid setting the dependency control bits */
795bool
796vec4_visitor::is_dep_ctrl_unsafe(const vec4_instruction *inst)
797{
798#define IS_DWORD(reg) \
799   (reg.type == BRW_REGISTER_TYPE_UD || \
800    reg.type == BRW_REGISTER_TYPE_D)
801
802   /* "When source or destination datatype is 64b or operation is integer DWord
803    * multiply, DepCtrl must not be used."
804    * May apply to future SoCs as well.
805    */
806   if (devinfo->is_cherryview) {
807      if (inst->opcode == BRW_OPCODE_MUL &&
808         IS_DWORD(inst->src[0]) &&
809         IS_DWORD(inst->src[1]))
810         return true;
811   }
812#undef IS_DWORD
813
814   if (devinfo->gen >= 8) {
815      if (inst->opcode == BRW_OPCODE_F32TO16)
816         return true;
817   }
818
819   /*
820    * mlen:
821    * In the presence of send messages, totally interrupt dependency
822    * control. They're long enough that the chance of dependency
823    * control around them just doesn't matter.
824    *
825    * predicate:
826    * From the Ivy Bridge PRM, volume 4 part 3.7, page 80:
827    * When a sequence of NoDDChk and NoDDClr are used, the last instruction that
828    * completes the scoreboard clear must have a non-zero execution mask. This
829    * means, if any kind of predication can change the execution mask or channel
830    * enable of the last instruction, the optimization must be avoided. This is
831    * to avoid instructions being shot down the pipeline when no writes are
832    * required.
833    *
834    * math:
835    * Dependency control does not work well over math instructions.
836    * NB: Discovered empirically
837    */
838   return (inst->mlen || inst->predicate || inst->is_math());
839}
840
841/**
842 * Sets the dependency control fields on instructions after register
843 * allocation and before the generator is run.
844 *
845 * When you have a sequence of instructions like:
846 *
847 * DP4 temp.x vertex uniform[0]
848 * DP4 temp.y vertex uniform[0]
849 * DP4 temp.z vertex uniform[0]
850 * DP4 temp.w vertex uniform[0]
851 *
852 * The hardware doesn't know that it can actually run the later instructions
853 * while the previous ones are in flight, producing stalls.  However, we have
854 * manual fields we can set in the instructions that let it do so.
855 */
856void
857vec4_visitor::opt_set_dependency_control()
858{
859   vec4_instruction *last_grf_write[BRW_MAX_GRF];
860   uint8_t grf_channels_written[BRW_MAX_GRF];
861   vec4_instruction *last_mrf_write[BRW_MAX_GRF];
862   uint8_t mrf_channels_written[BRW_MAX_GRF];
863
864   assert(prog_data->total_grf ||
865          !"Must be called after register allocation");
866
867   foreach_block (block, cfg) {
868      memset(last_grf_write, 0, sizeof(last_grf_write));
869      memset(last_mrf_write, 0, sizeof(last_mrf_write));
870
871      foreach_inst_in_block (vec4_instruction, inst, block) {
872         /* If we read from a register that we were doing dependency control
873          * on, don't do dependency control across the read.
874          */
875         for (int i = 0; i < 3; i++) {
876            int reg = inst->src[i].reg + inst->src[i].reg_offset;
877            if (inst->src[i].file == GRF) {
878               last_grf_write[reg] = NULL;
879            } else if (inst->src[i].file == HW_REG) {
880               memset(last_grf_write, 0, sizeof(last_grf_write));
881               break;
882            }
883            assert(inst->src[i].file != MRF);
884         }
885
886         if (is_dep_ctrl_unsafe(inst)) {
887            memset(last_grf_write, 0, sizeof(last_grf_write));
888            memset(last_mrf_write, 0, sizeof(last_mrf_write));
889            continue;
890         }
891
892         /* Now, see if we can do dependency control for this instruction
893          * against a previous one writing to its destination.
894          */
895         int reg = inst->dst.reg + inst->dst.reg_offset;
896         if (inst->dst.file == GRF) {
897            if (last_grf_write[reg] &&
898                !(inst->dst.writemask & grf_channels_written[reg])) {
899               last_grf_write[reg]->no_dd_clear = true;
900               inst->no_dd_check = true;
901            } else {
902               grf_channels_written[reg] = 0;
903            }
904
905            last_grf_write[reg] = inst;
906            grf_channels_written[reg] |= inst->dst.writemask;
907         } else if (inst->dst.file == MRF) {
908            if (last_mrf_write[reg] &&
909                !(inst->dst.writemask & mrf_channels_written[reg])) {
910               last_mrf_write[reg]->no_dd_clear = true;
911               inst->no_dd_check = true;
912            } else {
913               mrf_channels_written[reg] = 0;
914            }
915
916            last_mrf_write[reg] = inst;
917            mrf_channels_written[reg] |= inst->dst.writemask;
918         } else if (inst->dst.reg == HW_REG) {
919            if (inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE)
920               memset(last_grf_write, 0, sizeof(last_grf_write));
921            if (inst->dst.fixed_hw_reg.file == BRW_MESSAGE_REGISTER_FILE)
922               memset(last_mrf_write, 0, sizeof(last_mrf_write));
923         }
924      }
925   }
926}
927
928bool
929vec4_instruction::can_reswizzle(int dst_writemask,
930                                int swizzle,
931                                int swizzle_mask)
932{
933   /* If this instruction sets anything not referenced by swizzle, then we'd
934    * totally break it when we reswizzle.
935    */
936   if (dst.writemask & ~swizzle_mask)
937      return false;
938
939   if (mlen > 0)
940      return false;
941
942   return true;
943}
944
945/**
946 * For any channels in the swizzle's source that were populated by this
947 * instruction, rewrite the instruction to put the appropriate result directly
948 * in those channels.
949 *
950 * e.g. for swizzle=yywx, MUL a.xy b c -> MUL a.yy_x b.yy z.yy_x
951 */
952void
953vec4_instruction::reswizzle(int dst_writemask, int swizzle)
954{
955   /* Destination write mask doesn't correspond to source swizzle for the dot
956    * product and pack_bytes instructions.
957    */
958   if (opcode != BRW_OPCODE_DP4 && opcode != BRW_OPCODE_DPH &&
959       opcode != BRW_OPCODE_DP3 && opcode != BRW_OPCODE_DP2 &&
960       opcode != VEC4_OPCODE_PACK_BYTES) {
961      for (int i = 0; i < 3; i++) {
962         if (src[i].file == BAD_FILE || src[i].file == IMM)
963            continue;
964
965         src[i].swizzle = brw_compose_swizzle(swizzle, src[i].swizzle);
966      }
967   }
968
969   /* Apply the specified swizzle and writemask to the original mask of
970    * written components.
971    */
972   dst.writemask = dst_writemask &
973                   brw_apply_swizzle_to_mask(swizzle, dst.writemask);
974}
975
976/*
977 * Tries to reduce extra MOV instructions by taking temporary GRFs that get
978 * just written and then MOVed into another reg and making the original write
979 * of the GRF write directly to the final destination instead.
980 */
bool
vec4_visitor::opt_register_coalesce()
{
   bool progress = false;
   /* Counts instructions in the order the loop below visits them; the count
    * for the current instruction ("ip") is compared against live-range ends.
    */
   int next_ip = 0;

   calculate_live_intervals();

   foreach_block_and_inst_safe (block, vec4_instruction, inst, cfg) {
      int ip = next_ip;
      next_ip++;

      /* Only unpredicated MOVs from a GRF into a GRF or MRF are candidates,
       * and only when the MOV keeps the register type and uses no source
       * modifiers (abs/negate) or relative addressing.
       */
      if (inst->opcode != BRW_OPCODE_MOV ||
          (inst->dst.file != GRF && inst->dst.file != MRF) ||
	  inst->predicate ||
	  inst->src[0].file != GRF ||
	  inst->dst.type != inst->src[0].type ||
	  inst->src[0].abs || inst->src[0].negate || inst->src[0].reladdr)
	 continue;

      bool to_mrf = (inst->dst.file == MRF);

      /* Can't coalesce this GRF if someone else was going to
       * read it later.
       */
      if (var_range_end(var_from_reg(alloc, inst->src[0]), 4) > ip)
	 continue;

      /* We need to check interference with the final destination between this
       * instruction and the earliest instruction involved in writing the GRF
       * we're eliminating.  To do that, keep track of which of our source
       * channels we've seen initialized.
       */
      const unsigned chans_needed =
         brw_apply_inv_swizzle_to_mask(inst->src[0].swizzle,
                                       inst->dst.writemask);
      unsigned chans_remaining = chans_needed;

      /* Now walk up the instruction stream trying to see if we can rewrite
       * everything writing to the temporary to write into the destination
       * instead.
       *
       * _scan_inst remembers the last instruction the upward walk visited so
       * that the rewrite loop further down knows where to start (presumably
       * because the macro's own scan_inst is not visible after the loop).
       */
      vec4_instruction *_scan_inst = (vec4_instruction *)inst->prev;
      foreach_inst_in_block_reverse_starting_from(vec4_instruction, scan_inst,
                                                  inst, block) {
         _scan_inst = scan_inst;

         if (inst->src[0].in_range(scan_inst->dst, scan_inst->regs_written)) {
            /* Found something writing to the reg we want to coalesce away. */
            if (to_mrf) {
               /* SEND instructions can't have MRF as a destination. */
               if (scan_inst->mlen)
                  break;

               if (devinfo->gen == 6) {
                  /* gen6 math instructions must have the destination be
                   * GRF, so no compute-to-MRF for them.
                   */
                  if (scan_inst->is_math()) {
                     break;
                  }
               }
            }

            /* If we can't handle the swizzle, bail. */
            if (!scan_inst->can_reswizzle(inst->dst.writemask,
                                          inst->src[0].swizzle,
                                          chans_needed)) {
               break;
            }

            /* This doesn't handle coalescing of multiple registers. */
            if (scan_inst->regs_written > 1)
               break;

	    /* Mark which channels we found unconditional writes for. */
	    if (!scan_inst->predicate)
               chans_remaining &= ~scan_inst->dst.writemask;

	    /* Every needed channel has an unconditional writer: the scan
	     * succeeded and _scan_inst is the earliest writer.
	     */
	    if (chans_remaining == 0)
	       break;
	 }

         /* You can't read from an MRF, so if someone else reads our MRF's
          * source GRF that we wanted to rewrite, that stops us.  If it's a
          * GRF we're trying to coalesce to, we don't actually handle
          * rewriting sources so bail in that case as well.
          */
	 bool interfered = false;
	 for (int i = 0; i < 3; i++) {
            if (inst->src[0].in_range(scan_inst->src[i],
                                      scan_inst->regs_read(i)))
	       interfered = true;
	 }
	 if (interfered)
	    break;

         /* If somebody else writes our destination here, we can't coalesce
          * before that.
          */
         if (inst->dst.in_range(scan_inst->dst, scan_inst->regs_written))
	    break;

         /* Check for reads of the register we're trying to coalesce into.  We
          * can't go rewriting instructions above that to put some other value
          * in the register instead.
          *
          * A message send reads the MRF range [base_mrf, base_mrf + mlen);
          * everything else is checked through its (up to three) sources.
          */
         if (to_mrf && scan_inst->mlen > 0) {
            if (inst->dst.reg >= scan_inst->base_mrf &&
                inst->dst.reg < scan_inst->base_mrf + scan_inst->mlen) {
               break;
            }
         } else {
            for (int i = 0; i < 3; i++) {
               if (inst->dst.in_range(scan_inst->src[i],
                                      scan_inst->regs_read(i)))
                  interfered = true;
            }
            if (interfered)
               break;
         }
      }

      /* Any early exit from the scan above leaves chans_remaining non-zero,
       * in which case the MOV is left untouched.
       */
      if (chans_remaining == 0) {
	 /* If we've made it here, we have an MOV we want to coalesce out, and
	  * a scan_inst pointing to the earliest instruction involved in
	  * computing the value.  Now go rewrite the instruction stream
	  * between the two.
	  */
         vec4_instruction *scan_inst = _scan_inst;
	 while (scan_inst != inst) {
	    if (scan_inst->dst.file == GRF &&
		scan_inst->dst.reg == inst->src[0].reg &&
		scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
               /* Retarget this write of the temporary at the MOV's
                * destination; the MOV's saturate flag is OR'ed into the
                * writer so it is not lost when the MOV is removed.
                */
               scan_inst->reswizzle(inst->dst.writemask,
                                    inst->src[0].swizzle);
	       scan_inst->dst.file = inst->dst.file;
	       scan_inst->dst.reg = inst->dst.reg;
	       scan_inst->dst.reg_offset = inst->dst.reg_offset;
	       scan_inst->saturate |= inst->saturate;
	    }
	    scan_inst = (vec4_instruction *)scan_inst->next;
	 }
	 inst->remove(block);
	 progress = true;
      }
   }

   /* Instructions were rewritten/removed, so the intervals computed at the
    * top of this function no longer describe the program.
    */
   if (progress)
      invalidate_live_intervals();

   return progress;
}
1134
1135/**
1136 * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
1137 * flow.  We could probably do better here with some form of divergence
1138 * analysis.
1139 */
1140bool
1141vec4_visitor::eliminate_find_live_channel()
1142{
1143   bool progress = false;
1144   unsigned depth = 0;
1145
1146   foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1147      switch (inst->opcode) {
1148      case BRW_OPCODE_IF:
1149      case BRW_OPCODE_DO:
1150         depth++;
1151         break;
1152
1153      case BRW_OPCODE_ENDIF:
1154      case BRW_OPCODE_WHILE:
1155         depth--;
1156         break;
1157
1158      case SHADER_OPCODE_FIND_LIVE_CHANNEL:
1159         if (depth == 0) {
1160            inst->opcode = BRW_OPCODE_MOV;
1161            inst->src[0] = src_reg(0);
1162            inst->force_writemask_all = true;
1163            progress = true;
1164         }
1165         break;
1166
1167      default:
1168         break;
1169      }
1170   }
1171
1172   return progress;
1173}
1174
1175/**
1176 * Splits virtual GRFs requesting more than one contiguous physical register.
1177 *
1178 * We initially create large virtual GRFs for temporary structures, arrays,
1179 * and matrices, so that the dereference visitor functions can add reg_offsets
1180 * to work their way down to the actual member being accessed.  But when it
1181 * comes to optimization, we'd like to treat each register as individual
1182 * storage if possible.
1183 *
1184 * So far, the only thing that might prevent splitting is a send message from
1185 * a GRF on IVB.
1186 */
1187void
1188vec4_visitor::split_virtual_grfs()
1189{
1190   int num_vars = this->alloc.count;
1191   int new_virtual_grf[num_vars];
1192   bool split_grf[num_vars];
1193
1194   memset(new_virtual_grf, 0, sizeof(new_virtual_grf));
1195
1196   /* Try to split anything > 0 sized. */
1197   for (int i = 0; i < num_vars; i++) {
1198      split_grf[i] = this->alloc.sizes[i] != 1;
1199   }
1200
1201   /* Check that the instructions are compatible with the registers we're trying
1202    * to split.
1203    */
1204   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1205      if (inst->dst.file == GRF && inst->regs_written > 1)
1206         split_grf[inst->dst.reg] = false;
1207
1208      for (int i = 0; i < 3; i++) {
1209         if (inst->src[i].file == GRF && inst->regs_read(i) > 1)
1210            split_grf[inst->src[i].reg] = false;
1211      }
1212   }
1213
1214   /* Allocate new space for split regs.  Note that the virtual
1215    * numbers will be contiguous.
1216    */
1217   for (int i = 0; i < num_vars; i++) {
1218      if (!split_grf[i])
1219         continue;
1220
1221      new_virtual_grf[i] = alloc.allocate(1);
1222      for (unsigned j = 2; j < this->alloc.sizes[i]; j++) {
1223         unsigned reg = alloc.allocate(1);
1224         assert(reg == new_virtual_grf[i] + j - 1);
1225         (void) reg;
1226      }
1227      this->alloc.sizes[i] = 1;
1228   }
1229
1230   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1231      if (inst->dst.file == GRF && split_grf[inst->dst.reg] &&
1232          inst->dst.reg_offset != 0) {
1233         inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1234                          inst->dst.reg_offset - 1);
1235         inst->dst.reg_offset = 0;
1236      }
1237      for (int i = 0; i < 3; i++) {
1238         if (inst->src[i].file == GRF && split_grf[inst->src[i].reg] &&
1239             inst->src[i].reg_offset != 0) {
1240            inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1241                                inst->src[i].reg_offset - 1);
1242            inst->src[i].reg_offset = 0;
1243         }
1244      }
1245   }
1246   invalidate_live_intervals();
1247}
1248
/**
 * Convenience overload: print \p be_inst to stderr by forwarding to the
 * two-argument dump_instruction().
 */
void
vec4_visitor::dump_instruction(backend_instruction *be_inst)
{
   dump_instruction(be_inst, stderr);
}
1254
1255void
1256vec4_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
1257{
1258   vec4_instruction *inst = (vec4_instruction *)be_inst;
1259
1260   if (inst->predicate) {
1261      fprintf(file, "(%cf0.%d) ",
1262              inst->predicate_inverse ? '-' : '+',
1263              inst->flag_subreg);
1264   }
1265
1266   fprintf(file, "%s", brw_instruction_name(inst->opcode));
1267   if (inst->saturate)
1268      fprintf(file, ".sat");
1269   if (inst->conditional_mod) {
1270      fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
1271      if (!inst->predicate &&
1272          (devinfo->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
1273                                inst->opcode != BRW_OPCODE_IF &&
1274                                inst->opcode != BRW_OPCODE_WHILE))) {
1275         fprintf(file, ".f0.%d", inst->flag_subreg);
1276      }
1277   }
1278   fprintf(file, " ");
1279
1280   switch (inst->dst.file) {
1281   case GRF:
1282      fprintf(file, "vgrf%d.%d", inst->dst.reg, inst->dst.reg_offset);
1283      break;
1284   case MRF:
1285      fprintf(file, "m%d", inst->dst.reg);
1286      break;
1287   case HW_REG:
1288      if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
1289         switch (inst->dst.fixed_hw_reg.nr) {
1290         case BRW_ARF_NULL:
1291            fprintf(file, "null");
1292            break;
1293         case BRW_ARF_ADDRESS:
1294            fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
1295            break;
1296         case BRW_ARF_ACCUMULATOR:
1297            fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
1298            break;
1299         case BRW_ARF_FLAG:
1300            fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
1301                             inst->dst.fixed_hw_reg.subnr);
1302            break;
1303         default:
1304            fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
1305                               inst->dst.fixed_hw_reg.subnr);
1306            break;
1307         }
1308      } else {
1309         fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
1310      }
1311      if (inst->dst.fixed_hw_reg.subnr)
1312         fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
1313      break;
1314   case BAD_FILE:
1315      fprintf(file, "(null)");
1316      break;
1317   default:
1318      fprintf(file, "???");
1319      break;
1320   }
1321   if (inst->dst.writemask != WRITEMASK_XYZW) {
1322      fprintf(file, ".");
1323      if (inst->dst.writemask & 1)
1324         fprintf(file, "x");
1325      if (inst->dst.writemask & 2)
1326         fprintf(file, "y");
1327      if (inst->dst.writemask & 4)
1328         fprintf(file, "z");
1329      if (inst->dst.writemask & 8)
1330         fprintf(file, "w");
1331   }
1332   fprintf(file, ":%s", brw_reg_type_letters(inst->dst.type));
1333
1334   if (inst->src[0].file != BAD_FILE)
1335      fprintf(file, ", ");
1336
1337   for (int i = 0; i < 3 && inst->src[i].file != BAD_FILE; i++) {
1338      if (inst->src[i].negate)
1339         fprintf(file, "-");
1340      if (inst->src[i].abs)
1341         fprintf(file, "|");
1342      switch (inst->src[i].file) {
1343      case GRF:
1344         fprintf(file, "vgrf%d", inst->src[i].reg);
1345         break;
1346      case ATTR:
1347         fprintf(file, "attr%d", inst->src[i].reg);
1348         break;
1349      case UNIFORM:
1350         fprintf(file, "u%d", inst->src[i].reg);
1351         break;
1352      case IMM:
1353         switch (inst->src[i].type) {
1354         case BRW_REGISTER_TYPE_F:
1355            fprintf(file, "%fF", inst->src[i].fixed_hw_reg.dw1.f);
1356            break;
1357         case BRW_REGISTER_TYPE_D:
1358            fprintf(file, "%dD", inst->src[i].fixed_hw_reg.dw1.d);
1359            break;
1360         case BRW_REGISTER_TYPE_UD:
1361            fprintf(file, "%uU", inst->src[i].fixed_hw_reg.dw1.ud);
1362            break;
1363         case BRW_REGISTER_TYPE_VF:
1364            fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
1365                    brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >>  0) & 0xff),
1366                    brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >>  8) & 0xff),
1367                    brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
1368                    brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
1369            break;
1370         default:
1371            fprintf(file, "???");
1372            break;
1373         }
1374         break;
1375      case HW_REG:
1376         if (inst->src[i].fixed_hw_reg.negate)
1377            fprintf(file, "-");
1378         if (inst->src[i].fixed_hw_reg.abs)
1379            fprintf(file, "|");
1380         if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
1381            switch (inst->src[i].fixed_hw_reg.nr) {
1382            case BRW_ARF_NULL:
1383               fprintf(file, "null");
1384               break;
1385            case BRW_ARF_ADDRESS:
1386               fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
1387               break;
1388            case BRW_ARF_ACCUMULATOR:
1389               fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
1390               break;
1391            case BRW_ARF_FLAG:
1392               fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
1393                                inst->src[i].fixed_hw_reg.subnr);
1394               break;
1395            default:
1396               fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
1397                                  inst->src[i].fixed_hw_reg.subnr);
1398               break;
1399            }
1400         } else {
1401            fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
1402         }
1403         if (inst->src[i].fixed_hw_reg.subnr)
1404            fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
1405         if (inst->src[i].fixed_hw_reg.abs)
1406            fprintf(file, "|");
1407         break;
1408      case BAD_FILE:
1409         fprintf(file, "(null)");
1410         break;
1411      default:
1412         fprintf(file, "???");
1413         break;
1414      }
1415
1416      /* Don't print .0; and only VGRFs have reg_offsets and sizes */
1417      if (inst->src[i].reg_offset != 0 &&
1418          inst->src[i].file == GRF &&
1419          alloc.sizes[inst->src[i].reg] != 1)
1420         fprintf(file, ".%d", inst->src[i].reg_offset);
1421
1422      if (inst->src[i].file != IMM) {
1423         static const char *chans[4] = {"x", "y", "z", "w"};
1424         fprintf(file, ".");
1425         for (int c = 0; c < 4; c++) {
1426            fprintf(file, "%s", chans[BRW_GET_SWZ(inst->src[i].swizzle, c)]);
1427         }
1428      }
1429
1430      if (inst->src[i].abs)
1431         fprintf(file, "|");
1432
1433      if (inst->src[i].file != IMM) {
1434         fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
1435      }
1436
1437      if (i < 2 && inst->src[i + 1].file != BAD_FILE)
1438         fprintf(file, ", ");
1439   }
1440
1441   fprintf(file, "\n");
1442}
1443
1444
1445static inline struct brw_reg
1446attribute_to_hw_reg(int attr, bool interleaved)
1447{
1448   if (interleaved)
1449      return stride(brw_vec4_grf(attr / 2, (attr % 2) * 4), 0, 4, 1);
1450   else
1451      return brw_vec8_grf(attr, 0);
1452}
1453
1454
1455/**
1456 * Replace each register of type ATTR in this->instructions with a reference
1457 * to a fixed HW register.
1458 *
1459 * If interleaved is true, then each attribute takes up half a register, with
1460 * register N containing attribute 2*N in its first half and attribute 2*N+1
1461 * in its second half (this corresponds to the payload setup used by geometry
1462 * shaders in "single" or "dual instanced" dispatch mode).  If interleaved is
1463 * false, then each attribute takes up a whole register, with register N
1464 * containing attribute N (this corresponds to the payload setup used by
1465 * vertex shaders, and by geometry shaders in "dual object" dispatch mode).
1466 */
1467void
1468vec4_visitor::lower_attributes_to_hw_regs(const int *attribute_map,
1469                                          bool interleaved)
1470{
1471   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1472      /* We have to support ATTR as a destination for GL_FIXED fixup. */
1473      if (inst->dst.file == ATTR) {
1474	 int grf = attribute_map[inst->dst.reg + inst->dst.reg_offset];
1475
1476         /* All attributes used in the shader need to have been assigned a
1477          * hardware register by the caller
1478          */
1479         assert(grf != 0);
1480
1481	 struct brw_reg reg = attribute_to_hw_reg(grf, interleaved);
1482	 reg.type = inst->dst.type;
1483	 reg.dw1.bits.writemask = inst->dst.writemask;
1484
1485	 inst->dst.file = HW_REG;
1486	 inst->dst.fixed_hw_reg = reg;
1487      }
1488
1489      for (int i = 0; i < 3; i++) {
1490	 if (inst->src[i].file != ATTR)
1491	    continue;
1492
1493	 int grf = attribute_map[inst->src[i].reg + inst->src[i].reg_offset];
1494
1495         /* All attributes used in the shader need to have been assigned a
1496          * hardware register by the caller
1497          */
1498         assert(grf != 0);
1499
1500	 struct brw_reg reg = attribute_to_hw_reg(grf, interleaved);
1501	 reg.dw1.bits.swizzle = inst->src[i].swizzle;
1502         reg.type = inst->src[i].type;
1503	 if (inst->src[i].abs)
1504	    reg = brw_abs(reg);
1505	 if (inst->src[i].negate)
1506	    reg = negate(reg);
1507
1508	 inst->src[i].file = HW_REG;
1509	 inst->src[i].fixed_hw_reg = reg;
1510      }
1511   }
1512}
1513
1514int
1515vec4_vs_visitor::setup_attributes(int payload_reg)
1516{
1517   int nr_attributes;
1518   int attribute_map[VERT_ATTRIB_MAX + 1];
1519   memset(attribute_map, 0, sizeof(attribute_map));
1520
1521   nr_attributes = 0;
1522   for (int i = 0; i < VERT_ATTRIB_MAX; i++) {
1523      if (vs_prog_data->inputs_read & BITFIELD64_BIT(i)) {
1524	 attribute_map[i] = payload_reg + nr_attributes;
1525	 nr_attributes++;
1526      }
1527   }
1528
1529   /* VertexID is stored by the VF as the last vertex element, but we
1530    * don't represent it with a flag in inputs_read, so we call it
1531    * VERT_ATTRIB_MAX.
1532    */
1533   if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid) {
1534      attribute_map[VERT_ATTRIB_MAX] = payload_reg + nr_attributes;
1535      nr_attributes++;
1536   }
1537
1538   lower_attributes_to_hw_regs(attribute_map, false /* interleaved */);
1539
1540   /* The BSpec says we always have to read at least one thing from
1541    * the VF, and it appears that the hardware wedges otherwise.
1542    */
1543   if (nr_attributes == 0)
1544      nr_attributes = 1;
1545
1546   prog_data->urb_read_length = (nr_attributes + 1) / 2;
1547
1548   unsigned vue_entries =
1549      MAX2(nr_attributes, prog_data->vue_map.num_slots);
1550
1551   if (devinfo->gen == 6)
1552      prog_data->urb_entry_size = ALIGN(vue_entries, 8) / 8;
1553   else
1554      prog_data->urb_entry_size = ALIGN(vue_entries, 4) / 4;
1555
1556   return payload_reg + nr_attributes;
1557}
1558
1559int
1560vec4_visitor::setup_uniforms(int reg)
1561{
1562   prog_data->base.dispatch_grf_start_reg = reg;
1563
1564   /* The pre-gen6 VS requires that some push constants get loaded no
1565    * matter what, or the GPU would hang.
1566    */
1567   if (devinfo->gen < 6 && this->uniforms == 0) {
1568      assert(this->uniforms < this->uniform_array_size);
1569      this->uniform_vector_size[this->uniforms] = 1;
1570
1571      stage_prog_data->param =
1572         reralloc(NULL, stage_prog_data->param, const gl_constant_value *, 4);
1573      for (unsigned int i = 0; i < 4; i++) {
1574	 unsigned int slot = this->uniforms * 4 + i;
1575	 static gl_constant_value zero = { 0.0 };
1576	 stage_prog_data->param[slot] = &zero;
1577      }
1578
1579      this->uniforms++;
1580      reg++;
1581   } else {
1582      reg += ALIGN(uniforms, 2) / 2;
1583   }
1584
1585   stage_prog_data->nr_params = this->uniforms * 4;
1586
1587   prog_data->base.curb_read_length =
1588      reg - prog_data->base.dispatch_grf_start_reg;
1589
1590   return reg;
1591}
1592
1593void
1594vec4_vs_visitor::setup_payload(void)
1595{
1596   int reg = 0;
1597
1598   /* The payload always contains important data in g0, which contains
1599    * the URB handles that are passed on to the URB write at the end
1600    * of the thread.  So, we always start push constants at g1.
1601    */
1602   reg++;
1603
1604   reg = setup_uniforms(reg);
1605
1606   reg = setup_attributes(reg);
1607
1608   this->first_non_payload_grf = reg;
1609}
1610
/**
 * Set up the binding table layout by delegating to
 * assign_common_binding_table_offsets().
 */
void
vec4_visitor::assign_binding_table_offsets()
{
   /* NOTE(review): the 0 is presumably the first binding table offset to
    * hand out -- confirm against assign_common_binding_table_offsets().
    */
   assign_common_binding_table_offsets(0);
}
1616
1617src_reg
1618vec4_visitor::get_timestamp()
1619{
1620   assert(devinfo->gen >= 7);
1621
1622   src_reg ts = src_reg(brw_reg(BRW_ARCHITECTURE_REGISTER_FILE,
1623                                BRW_ARF_TIMESTAMP,
1624                                0,
1625                                0,
1626                                0,
1627                                BRW_REGISTER_TYPE_UD,
1628                                BRW_VERTICAL_STRIDE_0,
1629                                BRW_WIDTH_4,
1630                                BRW_HORIZONTAL_STRIDE_4,
1631                                BRW_SWIZZLE_XYZW,
1632                                WRITEMASK_XYZW));
1633
1634   dst_reg dst = dst_reg(this, glsl_type::uvec4_type);
1635
1636   vec4_instruction *mov = emit(MOV(dst, ts));
1637   /* We want to read the 3 fields we care about (mostly field 0, but also 2)
1638    * even if it's not enabled in the dispatch.
1639    */
1640   mov->force_writemask_all = true;
1641
1642   return src_reg(dst);
1643}
1644
/**
 * Emit the timestamp read that marks the start of the shader and remember
 * its result in shader_start_time (consumed by emit_shader_time_end()).
 */
void
vec4_visitor::emit_shader_time_begin()
{
   /* Set the annotation first so the instruction emitted by get_timestamp()
    * is labelled with it (presumably emit() picks up current_annotation).
    */
   current_annotation = "shader time start";
   shader_start_time = get_timestamp();
}
1651
1652void
1653vec4_visitor::emit_shader_time_end()
1654{
1655   current_annotation = "shader time end";
1656   src_reg shader_end_time = get_timestamp();
1657
1658
1659   /* Check that there weren't any timestamp reset events (assuming these
1660    * were the only two timestamp reads that happened).
1661    */
1662   src_reg reset_end = shader_end_time;
1663   reset_end.swizzle = BRW_SWIZZLE_ZZZZ;
1664   vec4_instruction *test = emit(AND(dst_null_d(), reset_end, src_reg(1u)));
1665   test->conditional_mod = BRW_CONDITIONAL_Z;
1666
1667   emit(IF(BRW_PREDICATE_NORMAL));
1668
1669   /* Take the current timestamp and get the delta. */
1670   shader_start_time.negate = true;
1671   dst_reg diff = dst_reg(this, glsl_type::uint_type);
1672   emit(ADD(diff, shader_start_time, shader_end_time));
1673
1674   /* If there were no instructions between the two timestamp gets, the diff
1675    * is 2 cycles.  Remove that overhead, so I can forget about that when
1676    * trying to determine the time taken for single instructions.
1677    */
1678   emit(ADD(diff, src_reg(diff), src_reg(-2u)));
1679
1680   emit_shader_time_write(0, src_reg(diff));
1681   emit_shader_time_write(1, src_reg(1u));
1682   emit(BRW_OPCODE_ELSE);
1683   emit_shader_time_write(2, src_reg(1u));
1684   emit(BRW_OPCODE_ENDIF);
1685}
1686
1687void
1688vec4_visitor::emit_shader_time_write(int shader_time_subindex, src_reg value)
1689{
1690   dst_reg dst =
1691      dst_reg(this, glsl_type::get_array_instance(glsl_type::vec4_type, 2));
1692
1693   dst_reg offset = dst;
1694   dst_reg time = dst;
1695   time.reg_offset++;
1696
1697   offset.type = BRW_REGISTER_TYPE_UD;
1698   int index = shader_time_index * 3 + shader_time_subindex;
1699   emit(MOV(offset, src_reg(index * SHADER_TIME_STRIDE)));
1700
1701   time.type = BRW_REGISTER_TYPE_UD;
1702   emit(MOV(time, src_reg(value)));
1703
1704   vec4_instruction *inst =
1705      emit(SHADER_OPCODE_SHADER_TIME_ADD, dst_reg(), src_reg(dst));
1706   inst->mlen = 2;
1707}
1708
/**
 * Drive the whole vec4 compile: emit the IR for the shader, lower array
 * accesses and uniforms, run the optimization loop, set up the payload and
 * allocate registers, then schedule.
 *
 * \param clip_planes  handed to setup_uniform_clipplane_values() when user
 *                     clipping is active and the program does not write clip
 *                     distances itself.
 * \return false if compilation failed (the `failed` flag got set), true
 *         otherwise.
 */
bool
vec4_visitor::run(gl_clip_plane *clip_planes)
{
   /* Remembered so the assert at the end can detect parameters that were
    * appended during compilation.
    */
   sanity_param_count = prog->Parameters->NumParameters;

   if (shader_time_index >= 0)
      emit_shader_time_begin();

   assign_binding_table_offsets();

   emit_prolog();

   /* Generate VS IR for main().  (the visitor only descends into
    * functions called "main").
    */
   if (shader) {
      visit_instructions(shader->base.ir);
   } else {
      emit_program_code();
   }
   base_ir = NULL;

   if (key->userclip_active && !prog->UsesClipDistanceOut)
      setup_uniform_clipplane_values(clip_planes);

   emit_thread_end();

   calculate_cfg();

   /* Before any optimization, push array accesses out to scratch
    * space where we need them to be.  This pass may allocate new
    * virtual GRFs, so we want to do it early.  It also makes sure
    * that we have reladdr computations available for CSE, since we'll
    * often do repeated subexpressions for those.
    */
   if (shader) {
      move_grf_array_access_to_scratch();
      move_uniform_array_access_to_pull_constants();
   } else {
      /* The ARB_vertex_program frontend emits pull constant loads directly
       * rather than using reladdr, so we don't need to walk through all the
       * instructions looking for things to move.  There isn't anything.
       *
       * We do still need to split things to vec4 size.
       */
      split_uniform_registers();
   }
   pack_uniform_registers();
   move_push_constants_to_pull_constants();
   split_virtual_grfs();

   /* OPT(pass, args...) runs one optimization pass and evaluates to whether
    * that pass made progress.  It bumps pass_num, ORs the result into
    * `progress` and, when INTEL_DEBUG has DEBUG_OPTIMIZER set, dumps the
    * instruction list to a file named after stage, program, iteration and
    * pass number for every pass that changed something.
    *
    * The macro relies on the locals `progress`, `iteration` and `pass_num`,
    * which are declared further down, before its first use.  It uses the GNU
    * statement-expression and named-variadic-macro extensions.
    */
#define OPT(pass, args...) ({                                          \
      pass_num++;                                                      \
      bool this_progress = pass(args);                                 \
                                                                       \
      if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) {  \
         char filename[64];                                            \
         snprintf(filename, 64, "%s-%04d-%02d-%02d-" #pass,            \
                  stage_abbrev, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
                                                                       \
         backend_shader::dump_instructions(filename);                  \
      }                                                                \
                                                                       \
      progress = progress || this_progress;                            \
      this_progress;                                                   \
   })


   /* Dump the unoptimized program as the "00-start" reference point. */
   if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
      char filename[64];
      snprintf(filename, 64, "%s-%04d-00-start",
               stage_abbrev, shader_prog ? shader_prog->Name : 0);

      backend_shader::dump_instructions(filename);
   }

   /* Main optimization loop: repeat the whole pass list until one full
    * iteration makes no progress.
    */
   bool progress;
   int iteration = 0;
   int pass_num = 0;
   do {
      progress = false;
      pass_num = 0;
      iteration++;

      OPT(opt_reduce_swizzle);
      OPT(dead_code_eliminate);
      OPT(dead_control_flow_eliminate, this);
      OPT(opt_copy_propagation);
      OPT(opt_cse);
      OPT(opt_algebraic);
      OPT(opt_register_coalesce);
      OPT(eliminate_find_live_channel);
   } while (progress);

   pass_num = 0;

   /* opt_vector_float runs once after the loop; the clean-up passes are only
    * worth running if it changed something.
    */
   if (OPT(opt_vector_float)) {
      OPT(opt_cse);
      OPT(opt_copy_propagation, false);
      OPT(opt_copy_propagation, true);
      OPT(dead_code_eliminate);
   }

   if (failed)
      return false;

   setup_payload();

   /* Disabled debugging aid; flip the condition to exercise spilling. */
   if (false) {
      /* Debug of register spilling: Go spill everything. */
      const int grf_count = alloc.count;
      float spill_costs[alloc.count];
      bool no_spill[alloc.count];
      evaluate_spill_costs(spill_costs, no_spill);
      for (int i = 0; i < grf_count; i++) {
         if (no_spill[i])
            continue;
         spill_reg(i);
      }
   }

   /* Keep retrying register allocation until it succeeds, giving up as soon
    * as the compile is marked failed (presumably a failed attempt spills a
    * register so that the next attempt can do better -- confirm in
    * reg_allocate()).
    */
   while (!reg_allocate()) {
      if (failed)
         return false;
   }

   opt_schedule_instructions();

   opt_set_dependency_control();

   /* If any state parameters were appended, then ParameterValues could have
    * been realloced, in which case the driver uniform storage set up by
    * _mesa_associate_uniform_storage() would point to freed memory.  Make
    * sure that didn't happen.
    */
   assert(sanity_param_count == prog->Parameters->NumParameters);

   return !failed;
}
1848
1849} /* namespace brw */
1850
1851extern "C" {
1852
1853/**
1854 * Compile a vertex shader.
1855 *
1856 * Returns the final assembly and the program's size.
1857 */
1858const unsigned *
1859brw_vs_emit(struct brw_context *brw,
1860            struct gl_shader_program *prog,
1861            struct brw_vs_compile *c,
1862            struct brw_vs_prog_data *prog_data,
1863            void *mem_ctx,
1864            unsigned *final_assembly_size)
1865{
1866   bool start_busy = false;
1867   double start_time = 0;
1868   const unsigned *assembly = NULL;
1869
1870   if (unlikely(brw->perf_debug)) {
1871      start_busy = (brw->batch.last_bo &&
1872                    drm_intel_bo_busy(brw->batch.last_bo));
1873      start_time = get_time();
1874   }
1875
1876   struct brw_shader *shader = NULL;
1877   if (prog)
1878      shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_VERTEX];
1879
1880   int st_index = -1;
1881   if (INTEL_DEBUG & DEBUG_SHADER_TIME)
1882      st_index = brw_get_shader_time_index(brw, prog, &c->vp->program.Base,
1883                                           ST_VS);
1884
1885   if (unlikely(INTEL_DEBUG & DEBUG_VS))
1886      brw_dump_ir("vertex", prog, &shader->base, &c->vp->program.Base);
1887
1888   if (brw->intelScreen->compiler->scalar_vs) {
1889      if (!c->vp->program.Base.nir) {
1890         /* Normally we generate NIR in LinkShader() or
1891          * ProgramStringNotify(), but Mesa's fixed-function vertex program
1892          * handling doesn't notify the driver at all.  Just do it here, at
1893          * the last minute, even though it's lame.
1894          */
1895         assert(c->vp->program.Base.Id == 0 && prog == NULL);
1896         c->vp->program.Base.nir =
1897            brw_create_nir(brw, NULL, &c->vp->program.Base, MESA_SHADER_VERTEX);
1898      }
1899
1900      prog_data->base.dispatch_mode = DISPATCH_MODE_SIMD8;
1901
1902      fs_visitor v(brw->intelScreen->compiler, brw,
1903                   mem_ctx, MESA_SHADER_VERTEX, &c->key,
1904                   &prog_data->base.base, prog, &c->vp->program.Base,
1905                   8, st_index);
1906      if (!v.run_vs(brw_select_clip_planes(&brw->ctx))) {
1907         if (prog) {
1908            prog->LinkStatus = false;
1909            ralloc_strcat(&prog->InfoLog, v.fail_msg);
1910         }
1911
1912         _mesa_problem(NULL, "Failed to compile vertex shader: %s\n",
1913                       v.fail_msg);
1914
1915         return NULL;
1916      }
1917
1918      fs_generator g(brw->intelScreen->compiler, brw,
1919                     mem_ctx, (void *) &c->key, &prog_data->base.base,
1920                     &c->vp->program.Base, v.promoted_constants,
1921                     v.runtime_check_aads_emit, "VS");
1922      if (INTEL_DEBUG & DEBUG_VS) {
1923         char *name;
1924         if (prog) {
1925            name = ralloc_asprintf(mem_ctx, "%s vertex shader %d",
1926                                   prog->Label ? prog->Label : "unnamed",
1927                                   prog->Name);
1928         } else {
1929            name = ralloc_asprintf(mem_ctx, "vertex program %d",
1930                                   c->vp->program.Base.Id);
1931         }
1932         g.enable_debug(name);
1933      }
1934      g.generate_code(v.cfg, 8);
1935      assembly = g.get_assembly(final_assembly_size);
1936
1937      c->base.last_scratch = v.last_scratch;
1938   }
1939
1940   if (!assembly) {
1941      prog_data->base.dispatch_mode = DISPATCH_MODE_4X2_DUAL_OBJECT;
1942
1943      vec4_vs_visitor v(brw->intelScreen->compiler, brw,
1944                        c, prog_data, prog, mem_ctx, st_index,
1945                        !_mesa_is_gles3(&brw->ctx));
1946      if (!v.run(brw_select_clip_planes(&brw->ctx))) {
1947         if (prog) {
1948            prog->LinkStatus = false;
1949            ralloc_strcat(&prog->InfoLog, v.fail_msg);
1950         }
1951
1952         _mesa_problem(NULL, "Failed to compile vertex shader: %s\n",
1953                       v.fail_msg);
1954
1955         return NULL;
1956      }
1957
1958      vec4_generator g(brw->intelScreen->compiler, brw,
1959                       prog, &c->vp->program.Base, &prog_data->base,
1960                       mem_ctx, INTEL_DEBUG & DEBUG_VS, "vertex", "VS");
1961      assembly = g.generate_assembly(v.cfg, final_assembly_size);
1962   }
1963
1964   if (unlikely(brw->perf_debug) && shader) {
1965      if (shader->compiled_once) {
1966         brw_vs_debug_recompile(brw, prog, &c->key);
1967      }
1968      if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
1969         perf_debug("VS compile took %.03f ms and stalled the GPU\n",
1970                    (get_time() - start_time) * 1000);
1971      }
1972      shader->compiled_once = true;
1973   }
1974
1975   return assembly;
1976}
1977
1978
1979void
1980brw_vue_setup_prog_key_for_precompile(struct gl_context *ctx,
1981                                      struct brw_vue_prog_key *key,
1982                                      GLuint id, struct gl_program *prog)
1983{
1984   struct brw_context *brw = brw_context(ctx);
1985   key->program_string_id = id;
1986
1987   brw_setup_tex_for_precompile(brw, &key->tex, prog);
1988}
1989
1990} /* extern "C" */
1991