brw_vec4_reg_allocate.cpp revision b22de71c1bc2530e139d75d934e203f4eee89f41
/*
 * Copyright © 2011 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

extern "C" {
#include "main/macros.h"
#include "program/register_allocate.h"
} /* extern "C" */

#include "brw_vec4.h"
#include "glsl/ir_print_visitor.h"

using namespace brw;

namespace brw {

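/* Rewrite a register reference from its pre-allocation virtual GRF number to
 * the hardware GRF number chosen for it by the allocator.
 */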
static void
assign(unsigned int *reg_hw_locations, reg *reg)
{
   if (reg->file == GRF) {
      reg->reg = reg_hw_locations[reg->reg];
   }
}

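/* Trivial allocator: lay out the live virtual GRFs contiguously after the
 * payload, with no reuse of registers across live intervals.  Kept as a
 * debugging fallback for the graph-based allocator below.
 */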
void
vec4_visitor::reg_allocate_trivial()
{
   unsigned int hw_reg_mapping[this->virtual_grf_count];
   bool virtual_grf_used[this->virtual_grf_count];
   int i;
   int next;

   /* Calculate which virtual GRFs are actually in use after whatever
    * optimization passes have occurred.
    */
   for (int i = 0; i < this->virtual_grf_count; i++) {
      virtual_grf_used[i] = false;
   }

   foreach_iter(exec_list_iterator, iter, this->instructions) {
      vec4_instruction *inst = (vec4_instruction *)iter.get();

      if (inst->dst.file == GRF)
         virtual_grf_used[inst->dst.reg] = true;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF)
            virtual_grf_used[inst->src[i].reg] = true;
      }
   }

   hw_reg_mapping[0] = this->first_non_payload_grf;
   next = hw_reg_mapping[0] + this->virtual_grf_sizes[0];
   for (i = 1; i < this->virtual_grf_count; i++) {
      if (virtual_grf_used[i]) {
         hw_reg_mapping[i] = next;
         next += this->virtual_grf_sizes[i];
      }
   }
   prog_data->total_grf = next;

   foreach_iter(exec_list_iterator, iter, this->instructions) {
      vec4_instruction *inst = (vec4_instruction *)iter.get();

      assign(hw_reg_mapping, &inst->dst);
      assign(hw_reg_mapping, &inst->src[0]);
      assign(hw_reg_mapping, &inst->src[1]);
      assign(hw_reg_mapping, &inst->src[2]);
   }

   if (prog_data->total_grf > max_grf) {
      fail("Ran out of regs on trivial allocator (%d/%d)\n",
           prog_data->total_grf, max_grf);
   }
}

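/* Set up the ra_* register set and classes for the class sizes in use: one
 * class per contiguous-register size, with every register in a class marked
 * as conflicting with each of the base GRFs it would occupy.
 */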
static void
brw_alloc_reg_set_for_classes(struct brw_context *brw,
                              int *class_sizes,
                              int class_count,
                              int base_reg_count)
{
   /* Compute the total number of registers across all classes. */
   int ra_reg_count = 0;
   for (int i = 0; i < class_count; i++) {
      ra_reg_count += base_reg_count - (class_sizes[i] - 1);
   }

   ralloc_free(brw->vs.ra_reg_to_grf);
   brw->vs.ra_reg_to_grf = ralloc_array(brw, uint8_t, ra_reg_count);
   ralloc_free(brw->vs.regs);
   brw->vs.regs = ra_alloc_reg_set(brw, ra_reg_count);
   ralloc_free(brw->vs.classes);
   brw->vs.classes = ralloc_array(brw, int, class_count + 1);

   /* Now, add the registers to their classes, and add the conflicts
    * between them and the base GRF registers (and also each other).
    */
   int reg = 0;
   for (int i = 0; i < class_count; i++) {
      int class_reg_count = base_reg_count - (class_sizes[i] - 1);
      brw->vs.classes[i] = ra_alloc_reg_class(brw->vs.regs);

      for (int j = 0; j < class_reg_count; j++) {
         ra_class_add_reg(brw->vs.regs, brw->vs.classes[i], reg);

         brw->vs.ra_reg_to_grf[reg] = j;

         for (int base_reg = j;
              base_reg < j + class_sizes[i];
              base_reg++) {
            ra_add_transitive_reg_conflict(brw->vs.regs, base_reg, reg);
         }

         reg++;
      }
   }
   assert(reg == ra_reg_count);

   ra_set_finalize(brw->vs.regs);
}

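/* Main register allocation path, built on the shared ra_* interference-graph
 * allocator.  On failure, one virtual GRF is chosen and spilled to scratch,
 * and the caller is expected to loop back in and retry.
 */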
void
vec4_visitor::reg_allocate()
{
   unsigned int hw_reg_mapping[virtual_grf_count];
   int first_assigned_grf = this->first_non_payload_grf;
   int base_reg_count = max_grf - first_assigned_grf;
   int class_sizes[base_reg_count];
   int class_count = 0;

   /* Using the trivial allocator can be useful in debugging undefined
    * register access as a result of broken optimization passes.
    */
   if (0) {
      reg_allocate_trivial();
      return;
   }

   calculate_live_intervals();

   /* Set up the register classes.
    *
    * The base registers store a vec4.  However, we'll need larger
    * storage for arrays, structures, and matrices, which will be sets
    * of contiguous registers.
    */
   class_sizes[class_count++] = 1;

   for (int r = 0; r < virtual_grf_count; r++) {
      int i;

      for (i = 0; i < class_count; i++) {
         if (class_sizes[i] == this->virtual_grf_sizes[r])
            break;
      }
      if (i == class_count) {
         if (this->virtual_grf_sizes[r] >= base_reg_count) {
            fail("Object too large to register allocate.\n");
         }

         class_sizes[class_count++] = this->virtual_grf_sizes[r];
      }
   }

   brw_alloc_reg_set_for_classes(brw, class_sizes, class_count, base_reg_count);

   struct ra_graph *g = ra_alloc_interference_graph(brw->vs.regs,
                                                    virtual_grf_count);

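   /* Assign each node to the class matching its virtual GRF size, and add
    * interference edges between virtual GRFs whose live intervals overlap.
    */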
   for (int i = 0; i < virtual_grf_count; i++) {
      for (int c = 0; c < class_count; c++) {
         if (class_sizes[c] == this->virtual_grf_sizes[i]) {
            ra_set_node_class(g, i, brw->vs.classes[c]);
            break;
         }
      }

      for (int j = 0; j < i; j++) {
         if (virtual_grf_interferes(i, j)) {
            ra_add_node_interference(g, i, j);
         }
      }
   }

   if (!ra_allocate_no_spills(g)) {
      /* Failed to allocate registers.  Spill a reg, and the caller will
       * loop back into here to try again.
       */
      int reg = choose_spill_reg(g);
      if (reg == -1) {
         fail("no register to spill\n");
      } else {
         spill_reg(reg);
      }
      ralloc_free(g);
      return;
   }

   /* Get the chosen virtual registers for each node, and map virtual
    * regs in the register classes back down to real hardware reg
    * numbers.
    */
   prog_data->total_grf = first_assigned_grf;
   for (int i = 0; i < virtual_grf_count; i++) {
      int reg = ra_get_node_reg(g, i);

      hw_reg_mapping[i] = first_assigned_grf + brw->vs.ra_reg_to_grf[reg];
      prog_data->total_grf = MAX2(prog_data->total_grf,
                                  hw_reg_mapping[i] + virtual_grf_sizes[i]);
   }

   foreach_list(node, &this->instructions) {
      vec4_instruction *inst = (vec4_instruction *)node;

      assign(hw_reg_mapping, &inst->dst);
      assign(hw_reg_mapping, &inst->src[0]);
      assign(hw_reg_mapping, &inst->src[1]);
      assign(hw_reg_mapping, &inst->src[2]);
   }

   ralloc_free(g);
}

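/* Estimate how expensive each virtual GRF would be to spill, and flag the
 * ones that must not be spilled: anything larger than one register, accessed
 * through relative addressing, or already used by a scratch access.
 */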
void
vec4_visitor::evaluate_spill_costs(float *spill_costs, bool *no_spill)
{
   float loop_scale = 1.0;

   for (int i = 0; i < this->virtual_grf_count; i++) {
      spill_costs[i] = 0.0;
      no_spill[i] = virtual_grf_sizes[i] != 1;
   }

   /* Calculate costs for spilling nodes.  Call it a cost of 1 per
    * spill/unspill we'll have to do, and guess that the insides of
    * loops run 10 times.
    */
   foreach_list(node, &this->instructions) {
      vec4_instruction *inst = (vec4_instruction *) node;

      for (unsigned int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF) {
            spill_costs[inst->src[i].reg] += loop_scale;
            if (inst->src[i].reladdr)
               no_spill[inst->src[i].reg] = true;
         }
      }

      if (inst->dst.file == GRF) {
         spill_costs[inst->dst.reg] += loop_scale;
         if (inst->dst.reladdr)
            no_spill[inst->dst.reg] = true;
      }

      switch (inst->opcode) {

      case BRW_OPCODE_DO:
         loop_scale *= 10;
         break;

      case BRW_OPCODE_WHILE:
         loop_scale /= 10;
         break;

      case VS_OPCODE_SCRATCH_READ:
      case VS_OPCODE_SCRATCH_WRITE:
         for (int i = 0; i < 3; i++) {
            if (inst->src[i].file == GRF)
               no_spill[inst->src[i].reg] = true;
         }
         if (inst->dst.file == GRF)
            no_spill[inst->dst.reg] = true;
         break;

      default:
         break;
      }
   }
}

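/* Feed the spill costs into the interference graph and let the register
 * allocator pick the best spill candidate; -1 means nothing can be spilled.
 */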
int
vec4_visitor::choose_spill_reg(struct ra_graph *g)
{
   float spill_costs[this->virtual_grf_count];
   bool no_spill[this->virtual_grf_count];

   evaluate_spill_costs(spill_costs, no_spill);

   for (int i = 0; i < this->virtual_grf_count; i++) {
      if (!no_spill[i])
         ra_set_node_spill_cost(g, i, spill_costs[i]);
   }

   return ra_get_best_spill_node(g);
}

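/* Rewrite every use of the chosen virtual GRF to go through scratch space:
 * each read gets a fresh temporary filled by a scratch read, and each write
 * goes to a fresh temporary that is then written out to scratch.
 */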
void
vec4_visitor::spill_reg(int spill_reg_nr)
{
   assert(virtual_grf_sizes[spill_reg_nr] == 1);
   unsigned int spill_offset = c->last_scratch++;

   /* Generate spill/unspill instructions for the objects being spilled. */
   foreach_list(node, &this->instructions) {
      vec4_instruction *inst = (vec4_instruction *) node;

      for (unsigned int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF && inst->src[i].reg == spill_reg_nr) {
            src_reg spill_reg = inst->src[i];
            inst->src[i].reg = virtual_grf_alloc(1);
            dst_reg temp = dst_reg(inst->src[i]);

            /* Only read the necessary channels, to avoid overwriting the rest
             * with data that may not have been written to scratch.
             */
            temp.writemask = 0;
            for (int c = 0; c < 4; c++)
               temp.writemask |= (1 << BRW_GET_SWZ(inst->src[i].swizzle, c));
            assert(temp.writemask != 0);

            emit_scratch_read(inst, temp, spill_reg, spill_offset);
         }
      }

      if (inst->dst.file == GRF && inst->dst.reg == spill_reg_nr) {
         dst_reg spill_reg = inst->dst;
         inst->dst.reg = virtual_grf_alloc(1);

         /* We don't want a swizzle when reading from the source; read the
          * whole register and use spill_reg's writemask to select which
          * channels to write.
          */
         src_reg temp = src_reg(inst->dst);
         temp.swizzle = BRW_SWIZZLE_XYZW;
         emit_scratch_write(inst, temp, spill_reg, spill_offset);
      }
   }

   this->live_intervals_valid = false;
}

} /* namespace brw */