/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Eric Anholt <eric@anholt.net>
 *
 */

#include "brw_fs.h"
#include "glsl/glsl_types.h"
#include "glsl/ir_optimization.h"
#include "glsl/ir_print_visitor.h"
/** @file brw_fs_schedule_instructions.cpp
 *
 * List scheduling of FS instructions.
 *
 * The basic model of the list scheduler is to take a basic block,
 * compute a DAG of the dependencies (RAW ordering with latency, WAW
 * ordering, WAR ordering), and make a list of the DAG heads.
 * Heuristically pick a DAG head, then put all the children that are
 * now DAG heads into the list of things to schedule.
 *
 * The heuristic is the important part.  We're trying to be cheap,
 * since actually computing the optimal scheduling is NP-complete.
 * What we do is track a "current clock".  When we schedule a node, we
 * update the earliest-unblocked clock time of its children, and
 * increment the clock.  Then, when trying to schedule, we just pick
 * the earliest-unblocked instruction to schedule.
 *
 * Note that often there will be many things which could execute
 * immediately, and there is a range of heuristic options to choose
 * from in picking among those.
 */

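/**
 * A node in the scheduling DAG: one instruction plus its outgoing
 * dependency edges (children), the per-edge latencies, the number of
 * unscheduled parents, and the earliest clock time at which it becomes
 * unblocked.  The constructor picks a rough latency estimate based on
 * the opcode.
 */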
class schedule_node : public exec_node
{
public:
   schedule_node(fs_inst *inst)
   {
      this->inst = inst;
      this->child_array_size = 0;
      this->children = NULL;
      this->child_latency = NULL;
      this->child_count = 0;
      this->parent_count = 0;
      this->unblocked_time = 0;

      int chans = 8;
      int math_latency = 22;

      switch (inst->opcode) {
      case SHADER_OPCODE_RCP:
         this->latency = 1 * chans * math_latency;
         break;
      case SHADER_OPCODE_RSQ:
         this->latency = 2 * chans * math_latency;
         break;
      case SHADER_OPCODE_INT_QUOTIENT:
      case SHADER_OPCODE_SQRT:
      case SHADER_OPCODE_LOG2:
         /* full precision log.  partial is 2. */
         this->latency = 3 * chans * math_latency;
         break;
      case SHADER_OPCODE_INT_REMAINDER:
      case SHADER_OPCODE_EXP2:
         /* full precision.  partial is 3, same throughput. */
         this->latency = 4 * chans * math_latency;
         break;
      case SHADER_OPCODE_POW:
         this->latency = 8 * chans * math_latency;
         break;
      case SHADER_OPCODE_SIN:
      case SHADER_OPCODE_COS:
         /* minimum latency, max is 12 rounds. */
         this->latency = 5 * chans * math_latency;
         break;
      default:
         this->latency = 2;
         break;
      }
   }

   fs_inst *inst;
   schedule_node **children;
   int *child_latency;
   int child_count;
   int parent_count;
   int child_array_size;
   int unblocked_time;
   int latency;
};

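/**
 * Owns the schedule_nodes for the current basic block (and the ralloc
 * context they are allocated from), builds the dependency DAG for
 * them, and emits the instructions back into the shader in scheduled
 * order.
 */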
class instruction_scheduler {
public:
   instruction_scheduler(fs_visitor *v, void *mem_ctx, int virtual_grf_count)
   {
      this->v = v;
      this->mem_ctx = ralloc_context(mem_ctx);
      this->virtual_grf_count = virtual_grf_count;
      this->instructions.make_empty();
      this->instructions_to_schedule = 0;
   }

   ~instruction_scheduler()
   {
      ralloc_free(this->mem_ctx);
   }
   void add_barrier_deps(schedule_node *n);
   void add_dep(schedule_node *before, schedule_node *after, int latency);
   void add_dep(schedule_node *before, schedule_node *after);

   void add_inst(fs_inst *inst);
   void calculate_deps();
   void schedule_instructions(fs_inst *next_block_header);

   bool is_compressed(fs_inst *inst);

   void *mem_ctx;

   int instructions_to_schedule;
   int virtual_grf_count;
   exec_list instructions;
   fs_visitor *v;
};

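/**
 * Pull an instruction out of the shader's instruction stream and wrap
 * it in a schedule_node on the scheduler's own list.
 */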
void
instruction_scheduler::add_inst(fs_inst *inst)
{
   schedule_node *n = new(mem_ctx) schedule_node(inst);

   assert(!inst->is_head_sentinel());
   assert(!inst->is_tail_sentinel());

   this->instructions_to_schedule++;

   inst->remove();
   instructions.push_tail(n);
}

/**
 * Add a dependency between two instruction nodes.
 *
 * The @after node will be scheduled after @before.  We will try to
 * schedule it @latency cycles after @before, but no guarantees there.
 */
void
instruction_scheduler::add_dep(schedule_node *before, schedule_node *after,
                               int latency)
{
   if (!before || !after)
      return;

   assert(before != after);

   for (int i = 0; i < before->child_count; i++) {
      if (before->children[i] == after) {
         before->child_latency[i] = MAX2(before->child_latency[i], latency);
         return;
      }
   }

   if (before->child_array_size <= before->child_count) {
      if (before->child_array_size < 16)
         before->child_array_size = 16;
      else
         before->child_array_size *= 2;

      before->children = reralloc(mem_ctx, before->children,
                                  schedule_node *,
                                  before->child_array_size);
      before->child_latency = reralloc(mem_ctx, before->child_latency,
                                       int, before->child_array_size);
   }

   before->children[before->child_count] = after;
   before->child_latency[before->child_count] = latency;
   before->child_count++;
   after->parent_count++;
}

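/**
 * Convenience wrapper that uses @before's own latency estimate as the
 * edge latency.
 */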
void
instruction_scheduler::add_dep(schedule_node *before, schedule_node *after)
{
   if (!before)
      return;

   add_dep(before, after, before->latency);
}

/**
 * Sometimes we really want this node to execute after everything that
 * was before it and before everything that followed it.  This adds
 * the deps to do so.
 */
void
instruction_scheduler::add_barrier_deps(schedule_node *n)
{
   schedule_node *prev = (schedule_node *)n->prev;
   schedule_node *next = (schedule_node *)n->next;

   if (prev) {
      while (!prev->is_head_sentinel()) {
         add_dep(prev, n, 0);
         prev = (schedule_node *)prev->prev;
      }
   }

   if (next) {
      while (!next->is_tail_sentinel()) {
         add_dep(n, next, 0);
         next = (schedule_node *)next->next;
      }
   }
}

/* Instruction scheduling needs to be aware of when an MRF write
 * actually writes 2 MRFs.
 */
bool
instruction_scheduler::is_compressed(fs_inst *inst)
{
   return (v->c->dispatch_width == 16 &&
           !inst->force_uncompressed &&
           !inst->force_sechalf);
}

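/**
 * Build the dependency DAG for the current block: a forward walk adds
 * read-after-write and write-after-write edges (plus flag-register and
 * MRF dependencies), and a backward walk adds write-after-read edges.
 */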
void
instruction_scheduler::calculate_deps()
{
   schedule_node *last_grf_write[virtual_grf_count];
   schedule_node *last_mrf_write[BRW_MAX_MRF];
   schedule_node *last_conditional_mod = NULL;
   /* Fixed HW registers are assumed to be separate from the virtual
    * GRFs, so they can be tracked separately.  We don't really write
    * to fixed GRFs much, so don't bother tracking them on a more
    * granular level.
    */
   schedule_node *last_fixed_grf_write = NULL;

   /* The last instruction always needs to still be the last
    * instruction.  Either it's flow control (IF, ELSE, ENDIF, DO,
    * WHILE) and scheduling other things after it would disturb the
    * basic block, or it's FB_WRITE and we should do a better job at
    * dead code elimination anyway.
    */
   schedule_node *last = (schedule_node *)instructions.get_tail();
   add_barrier_deps(last);

   memset(last_grf_write, 0, sizeof(last_grf_write));
   memset(last_mrf_write, 0, sizeof(last_mrf_write));

   /* top-to-bottom dependencies: RAW and WAW. */
   foreach_list(node, &instructions) {
      schedule_node *n = (schedule_node *)node;
      fs_inst *inst = n->inst;

      /* read-after-write deps. */
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF) {
            add_dep(last_grf_write[inst->src[i].reg], n);
         } else if (inst->src[i].file == FIXED_HW_REG &&
                    (inst->src[i].fixed_hw_reg.file ==
                     BRW_GENERAL_REGISTER_FILE)) {
            add_dep(last_fixed_grf_write, n);
         } else if (inst->src[i].file != BAD_FILE &&
                    inst->src[i].file != IMM &&
                    inst->src[i].file != UNIFORM) {
            assert(inst->src[i].file != MRF);
            add_barrier_deps(n);
         }
      }

      for (int i = 0; i < inst->mlen; i++) {
         /* It looks like the MRF regs are released in the send
          * instruction once it's sent, not when the result comes
          * back.
          */
         add_dep(last_mrf_write[inst->base_mrf + i], n);
      }

      if (inst->predicated) {
         assert(last_conditional_mod);
         add_dep(last_conditional_mod, n);
      }

      /* write-after-write deps. */
      if (inst->dst.file == GRF) {
         add_dep(last_grf_write[inst->dst.reg], n);
         last_grf_write[inst->dst.reg] = n;
      } else if (inst->dst.file == MRF) {
         int reg = inst->dst.reg & ~BRW_MRF_COMPR4;

         add_dep(last_mrf_write[reg], n);
         last_mrf_write[reg] = n;
         if (is_compressed(inst)) {
            if (inst->dst.reg & BRW_MRF_COMPR4)
               reg += 4;
            else
               reg++;
            add_dep(last_mrf_write[reg], n);
            last_mrf_write[reg] = n;
         }
      } else if (inst->dst.file == FIXED_HW_REG &&
                 inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
         last_fixed_grf_write = n;
      } else if (inst->dst.file != BAD_FILE) {
         add_barrier_deps(n);
      }

      if (inst->mlen > 0) {
         for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
            add_dep(last_mrf_write[inst->base_mrf + i], n);
            last_mrf_write[inst->base_mrf + i] = n;
         }
      }

      /* Treat FS_OPCODE_MOV_DISPATCH_TO_FLAGS as though it had a
       * conditional_mod, because it sets the flag register.
       */
      if (inst->conditional_mod ||
          inst->opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS) {
         add_dep(last_conditional_mod, n, 0);
         last_conditional_mod = n;
      }
   }

   /* bottom-to-top dependencies: WAR */
   memset(last_grf_write, 0, sizeof(last_grf_write));
   memset(last_mrf_write, 0, sizeof(last_mrf_write));
   last_conditional_mod = NULL;
   last_fixed_grf_write = NULL;

   exec_node *node;
   exec_node *prev;
   for (node = instructions.get_tail(), prev = node->prev;
        !node->is_head_sentinel();
        node = prev, prev = node->prev) {
      schedule_node *n = (schedule_node *)node;
      fs_inst *inst = n->inst;

      /* write-after-read deps. */
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF) {
            add_dep(n, last_grf_write[inst->src[i].reg]);
         } else if (inst->src[i].file == FIXED_HW_REG &&
                    (inst->src[i].fixed_hw_reg.file ==
                     BRW_GENERAL_REGISTER_FILE)) {
            add_dep(n, last_fixed_grf_write);
         } else if (inst->src[i].file != BAD_FILE &&
                    inst->src[i].file != IMM &&
                    inst->src[i].file != UNIFORM) {
            assert(inst->src[i].file != MRF);
            add_barrier_deps(n);
         }
      }

      for (int i = 0; i < inst->mlen; i++) {
         /* It looks like the MRF regs are released in the send
          * instruction once it's sent, not when the result comes
          * back.
          */
         add_dep(n, last_mrf_write[inst->base_mrf + i], 2);
      }

      if (inst->predicated) {
         add_dep(n, last_conditional_mod);
      }

      /* Update the things this instruction wrote, so earlier reads
       * can mark this as a WAR dependency.
       */
      if (inst->dst.file == GRF) {
         last_grf_write[inst->dst.reg] = n;
      } else if (inst->dst.file == MRF) {
         int reg = inst->dst.reg & ~BRW_MRF_COMPR4;

         last_mrf_write[reg] = n;

         if (is_compressed(inst)) {
            if (inst->dst.reg & BRW_MRF_COMPR4)
               reg += 4;
            else
               reg++;

            last_mrf_write[reg] = n;
         }
      } else if (inst->dst.file == FIXED_HW_REG &&
                 inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
         last_fixed_grf_write = n;
      } else if (inst->dst.file != BAD_FILE) {
         add_barrier_deps(n);
      }

      if (inst->mlen > 0) {
         for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
            last_mrf_write[inst->base_mrf + i] = n;
         }
      }

      /* Treat FS_OPCODE_MOV_DISPATCH_TO_FLAGS as though it had a
       * conditional_mod, because it sets the flag register.
       */
      if (inst->conditional_mod ||
          inst->opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS) {
         last_conditional_mod = n;
      }
   }
}

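/**
 * Emit the nodes of the current block back into the shader in
 * scheduled order: repeatedly pick the ready instruction with the
 * earliest unblocked time, insert it before @next_block_header, and
 * promote its now-ready children onto the candidate list.
 */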
void
instruction_scheduler::schedule_instructions(fs_inst *next_block_header)
{
   int time = 0;

   /* Remove non-DAG heads from the list. */
   foreach_list_safe(node, &instructions) {
      schedule_node *n = (schedule_node *)node;
      if (n->parent_count != 0)
         n->remove();
   }

   while (!instructions.is_empty()) {
      schedule_node *chosen = NULL;
      int chosen_time = 0;

      foreach_list(node, &instructions) {
         schedule_node *n = (schedule_node *)node;

         if (!chosen || n->unblocked_time < chosen_time) {
            chosen = n;
            chosen_time = n->unblocked_time;
         }
      }

      /* Schedule this instruction. */
      assert(chosen);
      chosen->remove();
      next_block_header->insert_before(chosen->inst);
      instructions_to_schedule--;

      /* Bump the clock.  If we expected a delay for scheduling, then
       * bump the clock to reflect that.
       */
      time = MAX2(time + 1, chosen_time);

      /* Now that we've scheduled a new instruction, some of its
       * children can be promoted to the list of instructions ready to
       * be scheduled.  Update the children's unblocked time for this
       * DAG edge as we do so.
       */
      for (int i = 0; i < chosen->child_count; i++) {
         schedule_node *child = chosen->children[i];

         child->unblocked_time = MAX2(child->unblocked_time,
                                      time + chosen->child_latency[i]);

         child->parent_count--;
         if (child->parent_count == 0) {
            instructions.push_tail(child);
         }
      }

      /* Shared resource: the mathbox.  There's one per EU on later
       * generations (and it's even more limited pre-gen6), so if we
       * send something off to it then the next math isn't going to
       * make progress until the first is done.
       */
      if (chosen->inst->is_math()) {
         foreach_list(node, &instructions) {
            schedule_node *n = (schedule_node *)node;

            if (n->inst->is_math())
               n->unblocked_time = MAX2(n->unblocked_time,
                                        time + chosen->latency);
         }
      }
   }

   assert(instructions_to_schedule == 0);
}

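/**
 * Top-level entry point: split the instruction stream at control-flow
 * instructions so each chunk scheduled is a single basic block, then
 * build deps and schedule each block in place.  Scheduling reorders
 * instructions, so live intervals are invalidated afterwards.
 */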
void
fs_visitor::schedule_instructions()
{
   fs_inst *next_block_header = (fs_inst *)instructions.head;
   instruction_scheduler sched(this, mem_ctx, this->virtual_grf_count);

   while (!next_block_header->is_tail_sentinel()) {
      /* Add things to be scheduled until we get to a new BB. */
      while (!next_block_header->is_tail_sentinel()) {
         fs_inst *inst = next_block_header;
         next_block_header = (fs_inst *)next_block_header->next;

         sched.add_inst(inst);
         if (inst->opcode == BRW_OPCODE_IF ||
             inst->opcode == BRW_OPCODE_ELSE ||
             inst->opcode == BRW_OPCODE_ENDIF ||
             inst->opcode == BRW_OPCODE_DO ||
             inst->opcode == BRW_OPCODE_WHILE ||
             inst->opcode == BRW_OPCODE_BREAK ||
             inst->opcode == BRW_OPCODE_CONTINUE) {
            break;
         }
      }
      sched.calculate_deps();
      sched.schedule_instructions(next_block_header);
   }

   this->live_intervals_valid = false;
}