// brw_vec4_reg_allocate.cpp — revision b22de71c1bc2530e139d75d934e203f4eee89f41
1/* 2 * Copyright © 2011 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 */ 23 24extern "C" { 25#include "main/macros.h" 26#include "program/register_allocate.h" 27} /* extern "C" */ 28 29#include "brw_vec4.h" 30#include "glsl/ir_print_visitor.h" 31 32using namespace brw; 33 34namespace brw { 35 36static void 37assign(unsigned int *reg_hw_locations, reg *reg) 38{ 39 if (reg->file == GRF) { 40 reg->reg = reg_hw_locations[reg->reg]; 41 } 42} 43 44void 45vec4_visitor::reg_allocate_trivial() 46{ 47 unsigned int hw_reg_mapping[this->virtual_grf_count]; 48 bool virtual_grf_used[this->virtual_grf_count]; 49 int i; 50 int next; 51 52 /* Calculate which virtual GRFs are actually in use after whatever 53 * optimization passes have occurred. 
54 */ 55 for (int i = 0; i < this->virtual_grf_count; i++) { 56 virtual_grf_used[i] = false; 57 } 58 59 foreach_iter(exec_list_iterator, iter, this->instructions) { 60 vec4_instruction *inst = (vec4_instruction *)iter.get(); 61 62 if (inst->dst.file == GRF) 63 virtual_grf_used[inst->dst.reg] = true; 64 65 for (int i = 0; i < 3; i++) { 66 if (inst->src[i].file == GRF) 67 virtual_grf_used[inst->src[i].reg] = true; 68 } 69 } 70 71 hw_reg_mapping[0] = this->first_non_payload_grf; 72 next = hw_reg_mapping[0] + this->virtual_grf_sizes[0]; 73 for (i = 1; i < this->virtual_grf_count; i++) { 74 if (virtual_grf_used[i]) { 75 hw_reg_mapping[i] = next; 76 next += this->virtual_grf_sizes[i]; 77 } 78 } 79 prog_data->total_grf = next; 80 81 foreach_iter(exec_list_iterator, iter, this->instructions) { 82 vec4_instruction *inst = (vec4_instruction *)iter.get(); 83 84 assign(hw_reg_mapping, &inst->dst); 85 assign(hw_reg_mapping, &inst->src[0]); 86 assign(hw_reg_mapping, &inst->src[1]); 87 assign(hw_reg_mapping, &inst->src[2]); 88 } 89 90 if (prog_data->total_grf > max_grf) { 91 fail("Ran out of regs on trivial allocator (%d/%d)\n", 92 prog_data->total_grf, max_grf); 93 } 94} 95 96static void 97brw_alloc_reg_set_for_classes(struct brw_context *brw, 98 int *class_sizes, 99 int class_count, 100 int base_reg_count) 101{ 102 /* Compute the total number of registers across all classes. */ 103 int ra_reg_count = 0; 104 for (int i = 0; i < class_count; i++) { 105 ra_reg_count += base_reg_count - (class_sizes[i] - 1); 106 } 107 108 ralloc_free(brw->vs.ra_reg_to_grf); 109 brw->vs.ra_reg_to_grf = ralloc_array(brw, uint8_t, ra_reg_count); 110 ralloc_free(brw->vs.regs); 111 brw->vs.regs = ra_alloc_reg_set(brw, ra_reg_count); 112 ralloc_free(brw->vs.classes); 113 brw->vs.classes = ralloc_array(brw, int, class_count + 1); 114 115 /* Now, add the registers to their classes, and add the conflicts 116 * between them and the base GRF registers (and also each other). 
117 */ 118 int reg = 0; 119 for (int i = 0; i < class_count; i++) { 120 int class_reg_count = base_reg_count - (class_sizes[i] - 1); 121 brw->vs.classes[i] = ra_alloc_reg_class(brw->vs.regs); 122 123 for (int j = 0; j < class_reg_count; j++) { 124 ra_class_add_reg(brw->vs.regs, brw->vs.classes[i], reg); 125 126 brw->vs.ra_reg_to_grf[reg] = j; 127 128 for (int base_reg = j; 129 base_reg < j + class_sizes[i]; 130 base_reg++) { 131 ra_add_transitive_reg_conflict(brw->vs.regs, base_reg, reg); 132 } 133 134 reg++; 135 } 136 } 137 assert(reg == ra_reg_count); 138 139 ra_set_finalize(brw->vs.regs); 140} 141 142void 143vec4_visitor::reg_allocate() 144{ 145 unsigned int hw_reg_mapping[virtual_grf_count]; 146 int first_assigned_grf = this->first_non_payload_grf; 147 int base_reg_count = max_grf - first_assigned_grf; 148 int class_sizes[base_reg_count]; 149 int class_count = 0; 150 151 /* Using the trivial allocator can be useful in debugging undefined 152 * register access as a result of broken optimization passes. 153 */ 154 if (0) { 155 reg_allocate_trivial(); 156 return; 157 } 158 159 calculate_live_intervals(); 160 161 /* Set up the register classes. 162 * 163 * The base registers store a vec4. However, we'll need larger 164 * storage for arrays, structures, and matrices, which will be sets 165 * of contiguous registers. 
166 */ 167 class_sizes[class_count++] = 1; 168 169 for (int r = 0; r < virtual_grf_count; r++) { 170 int i; 171 172 for (i = 0; i < class_count; i++) { 173 if (class_sizes[i] == this->virtual_grf_sizes[r]) 174 break; 175 } 176 if (i == class_count) { 177 if (this->virtual_grf_sizes[r] >= base_reg_count) { 178 fail("Object too large to register allocate.\n"); 179 } 180 181 class_sizes[class_count++] = this->virtual_grf_sizes[r]; 182 } 183 } 184 185 brw_alloc_reg_set_for_classes(brw, class_sizes, class_count, base_reg_count); 186 187 struct ra_graph *g = ra_alloc_interference_graph(brw->vs.regs, 188 virtual_grf_count); 189 190 for (int i = 0; i < virtual_grf_count; i++) { 191 for (int c = 0; c < class_count; c++) { 192 if (class_sizes[c] == this->virtual_grf_sizes[i]) { 193 ra_set_node_class(g, i, brw->vs.classes[c]); 194 break; 195 } 196 } 197 198 for (int j = 0; j < i; j++) { 199 if (virtual_grf_interferes(i, j)) { 200 ra_add_node_interference(g, i, j); 201 } 202 } 203 } 204 205 if (!ra_allocate_no_spills(g)) { 206 /* Failed to allocate registers. Spill a reg, and the caller will 207 * loop back into here to try again. 208 */ 209 int reg = choose_spill_reg(g); 210 if (reg == -1) { 211 fail("no register to spill\n"); 212 } else { 213 spill_reg(reg); 214 } 215 ralloc_free(g); 216 return; 217 } 218 219 /* Get the chosen virtual registers for each node, and map virtual 220 * regs in the register classes back down to real hardware reg 221 * numbers. 
222 */ 223 prog_data->total_grf = first_assigned_grf; 224 for (int i = 0; i < virtual_grf_count; i++) { 225 int reg = ra_get_node_reg(g, i); 226 227 hw_reg_mapping[i] = first_assigned_grf + brw->vs.ra_reg_to_grf[reg]; 228 prog_data->total_grf = MAX2(prog_data->total_grf, 229 hw_reg_mapping[i] + virtual_grf_sizes[i]); 230 } 231 232 foreach_list(node, &this->instructions) { 233 vec4_instruction *inst = (vec4_instruction *)node; 234 235 assign(hw_reg_mapping, &inst->dst); 236 assign(hw_reg_mapping, &inst->src[0]); 237 assign(hw_reg_mapping, &inst->src[1]); 238 assign(hw_reg_mapping, &inst->src[2]); 239 } 240 241 ralloc_free(g); 242} 243 244void 245vec4_visitor::evaluate_spill_costs(float *spill_costs, bool *no_spill) 246{ 247 float loop_scale = 1.0; 248 249 for (int i = 0; i < this->virtual_grf_count; i++) { 250 spill_costs[i] = 0.0; 251 no_spill[i] = virtual_grf_sizes[i] != 1; 252 } 253 254 /* Calculate costs for spilling nodes. Call it a cost of 1 per 255 * spill/unspill we'll have to do, and guess that the insides of 256 * loops run 10 times. 
257 */ 258 foreach_list(node, &this->instructions) { 259 vec4_instruction *inst = (vec4_instruction *) node; 260 261 for (unsigned int i = 0; i < 3; i++) { 262 if (inst->src[i].file == GRF) { 263 spill_costs[inst->src[i].reg] += loop_scale; 264 if (inst->src[i].reladdr) 265 no_spill[inst->src[i].reg] = true; 266 } 267 } 268 269 if (inst->dst.file == GRF) { 270 spill_costs[inst->dst.reg] += loop_scale; 271 if (inst->dst.reladdr) 272 no_spill[inst->dst.reg] = true; 273 } 274 275 switch (inst->opcode) { 276 277 case BRW_OPCODE_DO: 278 loop_scale *= 10; 279 break; 280 281 case BRW_OPCODE_WHILE: 282 loop_scale /= 10; 283 break; 284 285 case VS_OPCODE_SCRATCH_READ: 286 case VS_OPCODE_SCRATCH_WRITE: 287 for (int i = 0; i < 3; i++) { 288 if (inst->src[i].file == GRF) 289 no_spill[inst->src[i].reg] = true; 290 } 291 if (inst->dst.file == GRF) 292 no_spill[inst->dst.reg] = true; 293 break; 294 295 default: 296 break; 297 } 298 } 299} 300 301int 302vec4_visitor::choose_spill_reg(struct ra_graph *g) 303{ 304 float spill_costs[this->virtual_grf_count]; 305 bool no_spill[this->virtual_grf_count]; 306 307 evaluate_spill_costs(spill_costs, no_spill); 308 309 for (int i = 0; i < this->virtual_grf_count; i++) { 310 if (!no_spill[i]) 311 ra_set_node_spill_cost(g, i, spill_costs[i]); 312 } 313 314 return ra_get_best_spill_node(g); 315} 316 317void 318vec4_visitor::spill_reg(int spill_reg_nr) 319{ 320 assert(virtual_grf_sizes[spill_reg_nr] == 1); 321 unsigned int spill_offset = c->last_scratch++; 322 323 /* Generate spill/unspill instructions for the objects being spilled. 
*/ 324 foreach_list(node, &this->instructions) { 325 vec4_instruction *inst = (vec4_instruction *) node; 326 327 for (unsigned int i = 0; i < 3; i++) { 328 if (inst->src[i].file == GRF && inst->src[i].reg == spill_reg_nr) { 329 src_reg spill_reg = inst->src[i]; 330 inst->src[i].reg = virtual_grf_alloc(1); 331 dst_reg temp = dst_reg(inst->src[i]); 332 333 /* Only read the necessary channels, to avoid overwriting the rest 334 * with data that may not have been written to scratch. 335 */ 336 temp.writemask = 0; 337 for (int c = 0; c < 4; c++) 338 temp.writemask |= (1 << BRW_GET_SWZ(inst->src[i].swizzle, c)); 339 assert(temp.writemask != 0); 340 341 emit_scratch_read(inst, temp, spill_reg, spill_offset); 342 } 343 } 344 345 if (inst->dst.file == GRF && inst->dst.reg == spill_reg_nr) { 346 dst_reg spill_reg = inst->dst; 347 inst->dst.reg = virtual_grf_alloc(1); 348 349 /* We don't want a swizzle when reading from the source; read the 350 * whole register and use spill_reg's writemask to select which 351 * channels to write. 352 */ 353 src_reg temp = src_reg(inst->dst); 354 temp.swizzle = BRW_SWIZZLE_XYZW; 355 emit_scratch_write(inst, temp, spill_reg, spill_offset); 356 } 357 } 358 359 this->live_intervals_valid = false; 360} 361 362} /* namespace brw */ 363