/* brw_vec4.cpp revision 7e7c40ff98cc2b930bc3113609ace5430f2bdc95 */
/*
 * Copyright © 2011 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "brw_vec4.h"
extern "C" {
#include "main/macros.h"
#include "program/prog_parameter.h"
}

#define MAX_INSTRUCTION (1 << 30)

namespace brw {

34bool
35vec4_instruction::is_tex()
36{
37   return (opcode == SHADER_OPCODE_TEX ||
38	   opcode == SHADER_OPCODE_TXD ||
39	   opcode == SHADER_OPCODE_TXF ||
40	   opcode == SHADER_OPCODE_TXL ||
41	   opcode == SHADER_OPCODE_TXS);
42}
43
44bool
45vec4_instruction::is_math()
46{
47   return (opcode == SHADER_OPCODE_RCP ||
48	   opcode == SHADER_OPCODE_RSQ ||
49	   opcode == SHADER_OPCODE_SQRT ||
50	   opcode == SHADER_OPCODE_EXP2 ||
51	   opcode == SHADER_OPCODE_LOG2 ||
52	   opcode == SHADER_OPCODE_SIN ||
53	   opcode == SHADER_OPCODE_COS ||
54	   opcode == SHADER_OPCODE_INT_QUOTIENT ||
55	   opcode == SHADER_OPCODE_INT_REMAINDER ||
56	   opcode == SHADER_OPCODE_POW);
57}
58/**
59 * Returns how many MRFs an opcode will write over.
60 *
61 * Note that this is not the 0 or 1 implied writes in an actual gen
62 * instruction -- the generate_* functions generate additional MOVs
63 * for setup.
64 */
65int
66vec4_visitor::implied_mrf_writes(vec4_instruction *inst)
67{
68   if (inst->mlen == 0)
69      return 0;
70
71   switch (inst->opcode) {
72   case SHADER_OPCODE_RCP:
73   case SHADER_OPCODE_RSQ:
74   case SHADER_OPCODE_SQRT:
75   case SHADER_OPCODE_EXP2:
76   case SHADER_OPCODE_LOG2:
77   case SHADER_OPCODE_SIN:
78   case SHADER_OPCODE_COS:
79      return 1;
80   case SHADER_OPCODE_POW:
81      return 2;
82   case VS_OPCODE_URB_WRITE:
83      return 1;
84   case VS_OPCODE_PULL_CONSTANT_LOAD:
85      return 2;
86   case VS_OPCODE_SCRATCH_READ:
87      return 2;
88   case VS_OPCODE_SCRATCH_WRITE:
89      return 3;
90   default:
91      assert(!"not reached");
92      return inst->mlen;
93   }
94}
95
96bool
97src_reg::equals(src_reg *r)
98{
99   return (file == r->file &&
100	   reg == r->reg &&
101	   reg_offset == r->reg_offset &&
102	   type == r->type &&
103	   negate == r->negate &&
104	   abs == r->abs &&
105	   swizzle == r->swizzle &&
106	   !reladdr && !r->reladdr &&
107	   memcmp(&fixed_hw_reg, &r->fixed_hw_reg,
108		  sizeof(fixed_hw_reg)) == 0 &&
109	   imm.u == r->imm.u);
110}
111
112void
113vec4_visitor::calculate_live_intervals()
114{
115   int *def = ralloc_array(mem_ctx, int, virtual_grf_count);
116   int *use = ralloc_array(mem_ctx, int, virtual_grf_count);
117   int loop_depth = 0;
118   int loop_start = 0;
119
120   if (this->live_intervals_valid)
121      return;
122
123   for (int i = 0; i < virtual_grf_count; i++) {
124      def[i] = MAX_INSTRUCTION;
125      use[i] = -1;
126   }
127
128   int ip = 0;
129   foreach_list(node, &this->instructions) {
130      vec4_instruction *inst = (vec4_instruction *)node;
131
132      if (inst->opcode == BRW_OPCODE_DO) {
133	 if (loop_depth++ == 0)
134	    loop_start = ip;
135      } else if (inst->opcode == BRW_OPCODE_WHILE) {
136	 loop_depth--;
137
138	 if (loop_depth == 0) {
139	    /* Patches up the use of vars marked for being live across
140	     * the whole loop.
141	     */
142	    for (int i = 0; i < virtual_grf_count; i++) {
143	       if (use[i] == loop_start) {
144		  use[i] = ip;
145	       }
146	    }
147	 }
148      } else {
149	 for (unsigned int i = 0; i < 3; i++) {
150	    if (inst->src[i].file == GRF) {
151	       int reg = inst->src[i].reg;
152
153	       if (!loop_depth) {
154		  use[reg] = ip;
155	       } else {
156		  def[reg] = MIN2(loop_start, def[reg]);
157		  use[reg] = loop_start;
158
159		  /* Nobody else is going to go smash our start to
160		   * later in the loop now, because def[reg] now
161		   * points before the bb header.
162		   */
163	       }
164	    }
165	 }
166	 if (inst->dst.file == GRF) {
167	    int reg = inst->dst.reg;
168
169	    if (!loop_depth) {
170	       def[reg] = MIN2(def[reg], ip);
171	    } else {
172	       def[reg] = MIN2(def[reg], loop_start);
173	    }
174	 }
175      }
176
177      ip++;
178   }
179
180   ralloc_free(this->virtual_grf_def);
181   ralloc_free(this->virtual_grf_use);
182   this->virtual_grf_def = def;
183   this->virtual_grf_use = use;
184
185   this->live_intervals_valid = true;
186}
187
188bool
189vec4_visitor::virtual_grf_interferes(int a, int b)
190{
191   int start = MAX2(this->virtual_grf_def[a], this->virtual_grf_def[b]);
192   int end = MIN2(this->virtual_grf_use[a], this->virtual_grf_use[b]);
193
194   /* We can't handle dead register writes here, without iterating
195    * over the whole instruction stream to find every single dead
196    * write to that register to compare to the live interval of the
197    * other register.  Just assert that dead_code_eliminate() has been
198    * called.
199    */
200   assert((this->virtual_grf_use[a] != -1 ||
201	   this->virtual_grf_def[a] == MAX_INSTRUCTION) &&
202	  (this->virtual_grf_use[b] != -1 ||
203	   this->virtual_grf_def[b] == MAX_INSTRUCTION));
204
205   return start < end;
206}
207
208/**
209 * Must be called after calculate_live_intervales() to remove unused
210 * writes to registers -- register allocation will fail otherwise
211 * because something deffed but not used won't be considered to
212 * interfere with other regs.
213 */
214bool
215vec4_visitor::dead_code_eliminate()
216{
217   bool progress = false;
218   int pc = 0;
219
220   calculate_live_intervals();
221
222   foreach_list_safe(node, &this->instructions) {
223      vec4_instruction *inst = (vec4_instruction *)node;
224
225      if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
226	 inst->remove();
227	 progress = true;
228      }
229
230      pc++;
231   }
232
233   if (progress)
234      live_intervals_valid = false;
235
236   return progress;
237}
238
239void
240vec4_visitor::split_uniform_registers()
241{
242   /* Prior to this, uniforms have been in an array sized according to
243    * the number of vector uniforms present, sparsely filled (so an
244    * aggregate results in reg indices being skipped over).  Now we're
245    * going to cut those aggregates up so each .reg index is one
246    * vector.  The goal is to make elimination of unused uniform
247    * components easier later.
248    */
249   foreach_list(node, &this->instructions) {
250      vec4_instruction *inst = (vec4_instruction *)node;
251
252      for (int i = 0 ; i < 3; i++) {
253	 if (inst->src[i].file != UNIFORM)
254	    continue;
255
256	 assert(!inst->src[i].reladdr);
257
258	 inst->src[i].reg += inst->src[i].reg_offset;
259	 inst->src[i].reg_offset = 0;
260      }
261   }
262
263   /* Update that everything is now vector-sized. */
264   for (int i = 0; i < this->uniforms; i++) {
265      this->uniform_size[i] = 1;
266   }
267}
268
/**
 * Compacts the live uniform vectors into the smallest number of push
 * constant slots, updating both the param[] data and every instruction's
 * uniform references (register index and swizzle) to match.
 *
 * Assumes split_uniform_registers() has already run, so each uniform
 * .reg index names a single vector (reg_offset is always 0).
 */
void
vec4_visitor::pack_uniform_registers()
{
   /* NOTE(review): variable-length arrays are a GCC extension, not
    * standard C++ -- consistent with this codebase's existing usage.
    */
   bool uniform_used[this->uniforms];
   int new_loc[this->uniforms];
   int new_chan[this->uniforms];

   memset(uniform_used, 0, sizeof(uniform_used));
   memset(new_loc, 0, sizeof(new_loc));
   memset(new_chan, 0, sizeof(new_chan));

   /* Find which uniform vectors are actually used by the program.  We
    * expect unused vector elements when we've moved array access out
    * to pull constants, and from some GLSL code generators like wine.
    */
   foreach_list(node, &this->instructions) {
      vec4_instruction *inst = (vec4_instruction *)node;

      for (int i = 0 ; i < 3; i++) {
	 if (inst->src[i].file != UNIFORM)
	    continue;

	 uniform_used[inst->src[i].reg] = true;
      }
   }

   int new_uniform_count = 0;

   /* Now, figure out a packing of the live uniform vectors into our
    * push constants.
    */
   for (int src = 0; src < uniforms; src++) {
      int size = this->uniform_vector_size[src];

      /* Unused vectors simply vanish from the packing. */
      if (!uniform_used[src]) {
	 this->uniform_vector_size[src] = 0;
	 continue;
      }

      int dst;
      /* Find the lowest place we can slot this uniform in. */
      for (dst = 0; dst < src; dst++) {
	 if (this->uniform_vector_size[dst] + size <= 4)
	    break;
      }

      if (src == dst) {
	 /* No earlier slot had room: the vector stays where it is. */
	 new_loc[src] = dst;
	 new_chan[src] = 0;
      } else {
	 /* Pack this vector's channels after those already in dst. */
	 new_loc[src] = dst;
	 new_chan[src] = this->uniform_vector_size[dst];

	 /* Move the references to the data */
	 for (int j = 0; j < size; j++) {
	    c->prog_data.param[dst * 4 + new_chan[src] + j] =
	       c->prog_data.param[src * 4 + j];
	 }

	 this->uniform_vector_size[dst] += size;
	 this->uniform_vector_size[src] = 0;
      }

      new_uniform_count = MAX2(new_uniform_count, dst + 1);
   }

   this->uniforms = new_uniform_count;

   /* Now, update the instructions for our repacked uniforms. */
   foreach_list(node, &this->instructions) {
      vec4_instruction *inst = (vec4_instruction *)node;

      for (int i = 0 ; i < 3; i++) {
	 int src = inst->src[i].reg;

	 if (inst->src[i].file != UNIFORM)
	    continue;

	 inst->src[i].reg = new_loc[src];

	 /* Rebase each swizzle channel by the vector's new channel
	  * offset within its slot.
	  */
	 int sx = BRW_GET_SWZ(inst->src[i].swizzle, 0) + new_chan[src];
	 int sy = BRW_GET_SWZ(inst->src[i].swizzle, 1) + new_chan[src];
	 int sz = BRW_GET_SWZ(inst->src[i].swizzle, 2) + new_chan[src];
	 int sw = BRW_GET_SWZ(inst->src[i].swizzle, 3) + new_chan[src];
	 inst->src[i].swizzle = BRW_SWIZZLE4(sx, sy, sz, sw);
      }
   }
}

358bool
359src_reg::is_zero() const
360{
361   if (file != IMM)
362      return false;
363
364   if (type == BRW_REGISTER_TYPE_F) {
365      return imm.f == 0.0;
366   } else {
367      return imm.i == 0;
368   }
369}
370
371bool
372src_reg::is_one() const
373{
374   if (file != IMM)
375      return false;
376
377   if (type == BRW_REGISTER_TYPE_F) {
378      return imm.f == 1.0;
379   } else {
380      return imm.i == 1;
381   }
382}
383
384/**
385 * Does algebraic optimizations (0 * a = 0, 1 * a = a, a + 0 = a).
386 *
387 * While GLSL IR also performs this optimization, we end up with it in
388 * our instruction stream for a couple of reasons.  One is that we
389 * sometimes generate silly instructions, for example in array access
390 * where we'll generate "ADD offset, index, base" even if base is 0.
391 * The other is that GLSL IR's constant propagation doesn't track the
392 * components of aggregates, so some VS patterns (initialize matrix to
393 * 0, accumulate in vertex blending factors) end up breaking down to
394 * instructions involving 0.
395 */
396bool
397vec4_visitor::opt_algebraic()
398{
399   bool progress = false;
400
401   foreach_list(node, &this->instructions) {
402      vec4_instruction *inst = (vec4_instruction *)node;
403
404      switch (inst->opcode) {
405      case BRW_OPCODE_ADD:
406	 if (inst->src[1].is_zero()) {
407	    inst->opcode = BRW_OPCODE_MOV;
408	    inst->src[1] = src_reg();
409	    progress = true;
410	 }
411	 break;
412
413      case BRW_OPCODE_MUL:
414	 if (inst->src[1].is_zero()) {
415	    inst->opcode = BRW_OPCODE_MOV;
416	    switch (inst->src[0].type) {
417	    case BRW_REGISTER_TYPE_F:
418	       inst->src[0] = src_reg(0.0f);
419	       break;
420	    case BRW_REGISTER_TYPE_D:
421	       inst->src[0] = src_reg(0);
422	       break;
423	    case BRW_REGISTER_TYPE_UD:
424	       inst->src[0] = src_reg(0u);
425	       break;
426	    default:
427	       assert(!"not reached");
428	       inst->src[0] = src_reg(0.0f);
429	       break;
430	    }
431	    inst->src[1] = src_reg();
432	    progress = true;
433	 } else if (inst->src[1].is_one()) {
434	    inst->opcode = BRW_OPCODE_MOV;
435	    inst->src[1] = src_reg();
436	    progress = true;
437	 }
438	 break;
439      default:
440	 break;
441      }
442   }
443
444   if (progress)
445      this->live_intervals_valid = false;
446
447   return progress;
448}
449
/**
 * Only a limited number of hardware registers may be used for push
 * constants, so this turns access to the overflowed constants into
 * pull constants.
 */
void
vec4_visitor::move_push_constants_to_pull_constants()
{
   /* For each uniform vec4: the pull-constant slot it was demoted to,
    * or -1 if it remains a push constant.
    */
   int pull_constant_loc[this->uniforms];

   /* Only allow 32 registers (256 uniform components) as push constants,
    * which is the limit on gen6.
    */
   int max_uniform_components = 32 * 8;
   if (this->uniforms * 4 <= max_uniform_components)
      return;

   /* Make some sort of choice as to which uniforms get sent to pull
    * constants.  We could potentially do something clever here like
    * look for the most infrequently used uniform vec4s, but leave
    * that for later.
    */
   for (int i = 0; i < this->uniforms * 4; i += 4) {
      pull_constant_loc[i / 4] = -1;

      /* Everything past the push-constant budget gets demoted. */
      if (i >= max_uniform_components) {
	 const float **values = &prog_data->param[i];

	 /* Try to find an existing copy of this uniform in the pull
	  * constants if it was part of an array access already.
	  */
	 for (unsigned int j = 0; j < prog_data->nr_pull_params; j += 4) {
	    int matches;

	    /* All four channels must match for the slot to be reused. */
	    for (matches = 0; matches < 4; matches++) {
	       if (prog_data->pull_param[j + matches] != values[matches])
		  break;
	    }

	    if (matches == 4) {
	       pull_constant_loc[i / 4] = j / 4;
	       break;
	    }
	 }

	 if (pull_constant_loc[i / 4] == -1) {
	    /* No existing copy: append the four channel pointers to the
	     * pull parameter list, keeping it vec4-aligned.
	     */
	    assert(prog_data->nr_pull_params % 4 == 0);
	    pull_constant_loc[i / 4] = prog_data->nr_pull_params / 4;

	    for (int j = 0; j < 4; j++) {
	       prog_data->pull_param[prog_data->nr_pull_params++] = values[j];
	    }
	 }
      }
   }

   /* Now actually rewrite usage of the things we've moved to pull
    * constants.
    */
   foreach_list_safe(node, &this->instructions) {
      vec4_instruction *inst = (vec4_instruction *)node;

      for (int i = 0 ; i < 3; i++) {
	 if (inst->src[i].file != UNIFORM ||
	     pull_constant_loc[inst->src[i].reg] == -1)
	    continue;

	 int uniform = inst->src[i].reg;

	 /* Load the demoted uniform into a fresh temporary GRF just
	  * before this instruction, then read the temporary instead.
	  */
	 dst_reg temp = dst_reg(this, glsl_type::vec4_type);

	 emit_pull_constant_load(inst, temp, inst->src[i],
				 pull_constant_loc[uniform]);

	 inst->src[i].file = temp.file;
	 inst->src[i].reg = temp.reg;
	 inst->src[i].reg_offset = temp.reg_offset;
	 inst->src[i].reladdr = NULL;
      }
   }

   /* Repack push constants to remove the now-unused ones. */
   pack_uniform_registers();
}

/*
 * Tries to reduce extra MOV instructions by taking GRFs that get just
 * written and then MOVed into an MRF and making the original write of
 * the GRF write directly to the MRF instead.
 */
bool
vec4_visitor::opt_compute_to_mrf()
{
   bool progress = false;
   int next_ip = 0;

   calculate_live_intervals();

   foreach_list_safe(node, &this->instructions) {
      vec4_instruction *inst = (vec4_instruction *)node;

      int ip = next_ip;
      next_ip++;

      /* Only a plain, unpredicated GRF-to-MRF MOV with no source
       * modifiers, no indirect addressing, and no type conversion is a
       * candidate for elimination.
       */
      if (inst->opcode != BRW_OPCODE_MOV ||
	  inst->predicate ||
	  inst->dst.file != MRF || inst->src[0].file != GRF ||
	  inst->dst.type != inst->src[0].type ||
	  inst->src[0].abs || inst->src[0].negate || inst->src[0].reladdr)
	 continue;

      int mrf = inst->dst.reg;

      /* Can't compute-to-MRF this GRF if someone else was going to
       * read it later.
       */
      if (this->virtual_grf_use[inst->src[0].reg] > ip)
	 continue;

      /* We need to check interference with the MRF between this
       * instruction and the earliest instruction involved in writing
       * the GRF we're eliminating.  To do that, keep track of which
       * of our source channels we've seen initialized.
       */
      bool chans_needed[4] = {false, false, false, false};
      int chans_remaining = 0;
      for (int i = 0; i < 4; i++) {
	 int chan = BRW_GET_SWZ(inst->src[0].swizzle, i);

	 if (!(inst->dst.writemask & (1 << i)))
	    continue;

	 /* We don't handle compute-to-MRF across a swizzle.  We would
	  * need to be able to rewrite instructions above to output
	  * results to different channels.
	  */
	 if (chan != i)
	    chans_remaining = 5;  /* > 4 forces the bail-out below */

	 if (!chans_needed[chan]) {
	    chans_needed[chan] = true;
	    chans_remaining++;
	 }
      }
      if (chans_remaining > 4)
	 continue;

      /* Now walk up the instruction stream trying to see if we can
       * rewrite everything writing to the GRF into the MRF instead.
       */
      vec4_instruction *scan_inst;
      for (scan_inst = (vec4_instruction *)inst->prev;
	   scan_inst->prev != NULL;
	   scan_inst = (vec4_instruction *)scan_inst->prev) {
	 if (scan_inst->dst.file == GRF &&
	     scan_inst->dst.reg == inst->src[0].reg &&
	     scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
	    /* Found something writing to the reg we want to turn into
	     * a compute-to-MRF.
	     */

	    /* SEND instructions can't have MRF as a destination. */
	    if (scan_inst->mlen)
	       break;

	    if (intel->gen >= 6) {
	       /* gen6 math instructions must have the destination be
		* GRF, so no compute-to-MRF for them.
		*/
	       if (scan_inst->is_math()) {
		  break;
	       }
	    }

	    /* Mark which channels we found unconditional writes for. */
	    if (!scan_inst->predicate) {
	       for (int i = 0; i < 4; i++) {
		  if (scan_inst->dst.writemask & (1 << i) &&
		      chans_needed[i]) {
		     chans_needed[i] = false;
		     chans_remaining--;
		  }
	       }
	    }

	    /* All source channels accounted for: scan_inst is now the
	     * earliest instruction involved in computing the value.
	     */
	    if (chans_remaining == 0)
	       break;
	 }

	 /* We don't handle flow control here.  Most computation of
	  * values that end up in MRFs are shortly before the MRF
	  * write anyway.
	  */
	 if (scan_inst->opcode == BRW_OPCODE_DO ||
	     scan_inst->opcode == BRW_OPCODE_WHILE ||
	     scan_inst->opcode == BRW_OPCODE_ELSE ||
	     scan_inst->opcode == BRW_OPCODE_ENDIF) {
	    break;
	 }

	 /* You can't read from an MRF, so if someone else reads our
	  * MRF's source GRF that we wanted to rewrite, that stops us.
	  */
	 bool interfered = false;
	 for (int i = 0; i < 3; i++) {
	    if (scan_inst->src[i].file == GRF &&
		scan_inst->src[i].reg == inst->src[0].reg &&
		scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
	       interfered = true;
	    }
	 }
	 if (interfered)
	    break;

	 /* If somebody else writes our MRF here, we can't
	  * compute-to-MRF before that.
	  */
	 if (scan_inst->dst.file == MRF && mrf == scan_inst->dst.reg)
	    break;

	 if (scan_inst->mlen > 0) {
	    /* Found a SEND instruction, which means that there are
	     * live values in MRFs from base_mrf to base_mrf +
	     * scan_inst->mlen - 1.  Don't go pushing our MRF write up
	     * above it.
	     */
	    if (mrf >= scan_inst->base_mrf &&
		mrf < scan_inst->base_mrf + scan_inst->mlen) {
	       break;
	    }
	 }
      }

      if (chans_remaining == 0) {
	 /* If we've made it here, we have an inst we want to
	  * compute-to-MRF, and a scan_inst pointing to the earliest
	  * instruction involved in computing the value.  Now go
	  * rewrite the instruction stream between the two.
	  */

	 while (scan_inst != inst) {
	    if (scan_inst->dst.file == GRF &&
		scan_inst->dst.reg == inst->src[0].reg &&
		scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
	       /* Retarget the GRF write directly at the MRF, inheriting
		* the MOV's saturate modifier.
		*/
	       scan_inst->dst.file = MRF;
	       scan_inst->dst.reg = mrf;
	       scan_inst->dst.reg_offset = 0;
	       scan_inst->saturate |= inst->saturate;
	    }
	    scan_inst = (vec4_instruction *)scan_inst->next;
	 }
	 inst->remove();
	 progress = true;
      }
   }

   if (progress)
      live_intervals_valid = false;

   return progress;
}

} /* namespace brw */