/*
 * Copyright © 2011 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "brw_vec4.h"
#include "brw_cfg.h"
#include "brw_eu.h"
#include "brw_program.h"

namespace brw {

vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
                                   const src_reg &src0, const src_reg &src1,
                                   const src_reg &src2)
{
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;
   this->src[2] = src2;
   this->saturate = false;
   this->force_writemask_all = false;
   this->no_dd_clear = false;
   this->no_dd_check = false;
   this->writes_accumulator = false;
   this->conditional_mod = BRW_CONDITIONAL_NONE;
   this->predicate = BRW_PREDICATE_NONE;
   this->predicate_inverse = false;
   this->target = 0;
   this->shadow_compare = false;
   this->ir = NULL;
   this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
   this->header_size = 0;
   this->flag_subreg = 0;
   this->mlen = 0;
   this->base_mrf = 0;
   this->offset = 0;
   this->exec_size = 8;
   this->group = 0;
   this->size_written = (dst.file == BAD_FILE ?
                         0 : this->exec_size * type_sz(dst.type));
   this->annotation = NULL;
}

vec4_instruction *
vec4_visitor::emit(vec4_instruction *inst)
{
   inst->ir = this->base_ir;
   inst->annotation = this->current_annotation;

   this->instructions.push_tail(inst);

   return inst;
}

vec4_instruction *
vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
                          vec4_instruction *new_inst)
{
   new_inst->ir = inst->ir;
   new_inst->annotation = inst->annotation;

   inst->insert_before(block, new_inst);

   return inst;
}

vec4_instruction *
vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
                   const src_reg &src1, const src_reg &src2)
{
   return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
}


vec4_instruction *
vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
                   const src_reg &src1)
{
   return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
}

vec4_instruction *
vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
{
   return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
}

vec4_instruction *
vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
{
   return emit(new(mem_ctx) vec4_instruction(opcode, dst));
}

vec4_instruction *
vec4_visitor::emit(enum opcode opcode)
{
   return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
}

#define ALU1(op)                                                        \
   vec4_instruction *                                                   \
   vec4_visitor::op(const dst_reg &dst, const src_reg &src0)            \
   {                                                                    \
      return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
   }

#define ALU2(op)                                                        \
   vec4_instruction *                                                   \
   vec4_visitor::op(const dst_reg &dst, const src_reg &src0,            \
                    const src_reg &src1)                                \
   {                                                                    \
      return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst,        \
                                           src0, src1);                 \
   }

#define ALU2_ACC(op)                                                    \
   vec4_instruction *                                                   \
   vec4_visitor::op(const dst_reg &dst, const src_reg &src0,            \
                    const src_reg &src1)                                \
   {                                                                    \
      vec4_instruction *inst = new(mem_ctx) vec4_instruction(           \
         BRW_OPCODE_##op, dst, src0, src1);                             \
      inst->writes_accumulator = true;                                  \
      return inst;                                                      \
   }

#define ALU3(op)                                                        \
   vec4_instruction *                                                   \
   vec4_visitor::op(const dst_reg &dst, const src_reg &src0,            \
                    const src_reg &src1, const src_reg &src2)           \
   {                                                                    \
      assert(devinfo->gen >= 6);                                        \
      return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst,        \
                                           src0, src1, src2);           \
   }

ALU1(NOT)
ALU1(MOV)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDZ)
ALU1(F32TO16)
ALU1(F16TO32)
ALU2(ADD)
ALU2(MUL)
ALU2_ACC(MACH)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(DP3)
ALU2(DP4)
ALU2(DPH)
ALU2(SHL)
ALU2(SHR)
ALU2(ASR)
ALU3(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)
ALU3(MAD)
ALU2_ACC(ADDC)
ALU2_ACC(SUBB)
ALU2(MAC)
ALU1(DIM)

/** Gen4 predicated IF. */
vec4_instruction *
vec4_visitor::IF(enum brw_predicate predicate)
{
   vec4_instruction *inst;

   inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
   inst->predicate = predicate;

   return inst;
}

/** Gen6 IF with embedded comparison. */
vec4_instruction *
vec4_visitor::IF(src_reg src0, src_reg src1,
                 enum brw_conditional_mod condition)
{
   assert(devinfo->gen == 6);

   vec4_instruction *inst;

   resolve_ud_negate(&src0);
   resolve_ud_negate(&src1);

   inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
                                        src0, src1);
   inst->conditional_mod = condition;

   return inst;
}

/**
 * CMP: Sets the low bit of the destination channels with the result
 * of the comparison, while the upper bits are undefined, and updates
 * the flag register with the packed 16 bits of the result.
 */
vec4_instruction *
vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
                  enum brw_conditional_mod condition)
{
   vec4_instruction *inst;

   /* Take the instruction:
    *
    *   CMP null<d> src0<f> src1<f>
    *
    * Original gen4 does type conversion to the destination type before
    * comparison, producing garbage results for floating point comparisons.
    *
    * The destination type doesn't matter on newer generations, so we set the
    * type to match src0 so we can compact the instruction.
    */
   dst.type = src0.type;

   resolve_ud_negate(&src0);
   resolve_ud_negate(&src1);

   inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
   inst->conditional_mod = condition;

   return inst;
}

vec4_instruction *
vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
{
   vec4_instruction *inst;

   inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
                                        dst, index);
   inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen) + 1;
   inst->mlen = 2;

   return inst;
}

vec4_instruction *
vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
                            const src_reg &index)
{
   vec4_instruction *inst;

   inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
                                        dst, src, index);
   inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen);
   inst->mlen = 3;

   return inst;
}

src_reg
vec4_visitor::fix_3src_operand(const src_reg &src)
{
   /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
    * able to use vertical stride of zero to replicate the vec4 uniform, like
    *
    *    g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
    *
    * But you can't, since vertical stride is always four in three-source
    * instructions. Instead, insert a MOV instruction to do the replication so
    * that the three-source instruction can consume it.
    */

   /* The MOV is only needed if the source is a uniform or immediate. */
   if (src.file != UNIFORM && src.file != IMM)
      return src;

   if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
      return src;

   dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
   expanded.type = src.type;
   emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
   return src_reg(expanded);
}

src_reg
vec4_visitor::resolve_source_modifiers(const src_reg &src)
{
   if (!src.abs && !src.negate)
      return src;

   dst_reg resolved = dst_reg(this, glsl_type::ivec4_type);
   resolved.type = src.type;
   emit(MOV(resolved, src));

   return src_reg(resolved);
}

src_reg
vec4_visitor::fix_math_operand(const src_reg &src)
{
   if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
      return src;

   /* The gen6 math instruction ignores the source modifiers --
    * swizzle, abs, negate, and at least some parts of the register
    * region description.
    *
    * Rather than trying to enumerate all these cases, *always* expand the
    * operand to a temp GRF for gen6.
    *
    * For gen7, keep the operand as-is, except if immediate, which gen7 still
    * can't use.
    */

   if (devinfo->gen == 7 && src.file != IMM)
      return src;

   dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
   expanded.type = src.type;
   emit(MOV(expanded, src));
   return src_reg(expanded);
}

vec4_instruction *
vec4_visitor::emit_math(enum opcode opcode,
                        const dst_reg &dst,
                        const src_reg &src0, const src_reg &src1)
{
   vec4_instruction *math =
      emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));

   if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
      /* MATH on Gen6 must be align1, so we can't do writemasks. */
      math->dst = dst_reg(this, glsl_type::vec4_type);
      math->dst.type = dst.type;
      math = emit(MOV(dst, src_reg(math->dst)));
   } else if (devinfo->gen < 6) {
      math->base_mrf = 1;
      math->mlen = src1.file == BAD_FILE ? 1 : 2;
   }

   return math;
}

void
vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
{
   if (devinfo->gen < 7) {
      unreachable("ir_unop_pack_half_2x16 should be lowered");
   }

   assert(dst.type == BRW_REGISTER_TYPE_UD);
   assert(src0.type == BRW_REGISTER_TYPE_F);

   /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
    *
    *   Because this instruction does not have a 16-bit floating-point type,
    *   the destination data type must be Word (W).
    *
    *   The destination must be DWord-aligned and specify a horizontal stride
    *   (HorzStride) of 2. The 16-bit result is stored in the lower word of
    *   each destination channel and the upper word is not modified.
    *
    * The above restriction implies that the f32to16 instruction must use
    * align1 mode, because only in align1 mode is it possible to specify
    * horizontal stride. We choose here to defy the hardware docs and emit
    * align16 instructions.
    *
    * (I [chadv] did attempt to emit align1 instructions for VS f32to16
    * instructions. I was partially successful in that the code passed all
    * tests. However, the code was dubiously correct and fragile, and the
    * tests were not harsh enough to probe that frailty. Not trusting the
    * code, I chose instead to remain in align16 mode in defiance of the hw
    * docs).
    *
    * I've [chadv] experimentally confirmed that, on gen7 hardware and the
    * simulator, emitting a f32to16 in align16 mode with UD as destination
    * data type is safe. The behavior differs from that specified in the PRM
    * in that the upper word of each destination channel is cleared to 0.
    */

   dst_reg tmp_dst(this, glsl_type::uvec2_type);
   src_reg tmp_src(tmp_dst);

#if 0
   /* Verify the undocumented behavior on which the following instructions
    * rely. If f32to16 fails to clear the upper word of the X and Y channels,
    * then the result of the bit-or instruction below will be incorrect.
    *
    * You should inspect the disasm output in order to verify that the MOV is
    * not optimized away.
    */
   emit(MOV(tmp_dst, brw_imm_ud(0x12345678u)));
#endif

   /* Give tmp the form below, where "." means untouched.
    *
    *     w z          y          x w z          y          x
    *   |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
    *
    * That the upper word of each write-channel be 0 is required for the
    * following bit-shift and bit-or instructions to work. Note that this
    * relies on the undocumented hardware behavior mentioned above.
    */
   tmp_dst.writemask = WRITEMASK_XY;
   emit(F32TO16(tmp_dst, src0));

   /* Give the write-channels of dst the form:
    *   0xhhhh0000
    */
   tmp_src.swizzle = BRW_SWIZZLE_YYYY;
   emit(SHL(dst, tmp_src, brw_imm_ud(16u)));

   /* Finally, give the write-channels of dst the form of packHalf2x16's
    * output:
    *   0xhhhhllll
    */
   tmp_src.swizzle = BRW_SWIZZLE_XXXX;
   emit(OR(dst, src_reg(dst), tmp_src));
}

void
vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
{
   if (devinfo->gen < 7) {
      unreachable("ir_unop_unpack_half_2x16 should be lowered");
   }

   assert(dst.type == BRW_REGISTER_TYPE_F);
   assert(src0.type == BRW_REGISTER_TYPE_UD);

   /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
    *
    *   Because this instruction does not have a 16-bit floating-point type,
    *   the source data type must be Word (W). The destination type must be
    *   F (Float).
    *
    * To use W as the source data type, we must adjust horizontal strides,
    * which is only possible in align1 mode. All my [chadv] attempts at
    * emitting align1 instructions for unpackHalf2x16 failed to pass the
    * Piglit tests, so I gave up.
    *
    * I've verified that, on gen7 hardware and the simulator, it is safe to
    * emit f16to32 in align16 mode with UD as source data type.
    */

   dst_reg tmp_dst(this, glsl_type::uvec2_type);
   src_reg tmp_src(tmp_dst);

   tmp_dst.writemask = WRITEMASK_X;
   emit(AND(tmp_dst, src0, brw_imm_ud(0xffffu)));

   tmp_dst.writemask = WRITEMASK_Y;
   emit(SHR(tmp_dst, src0, brw_imm_ud(16u)));

   dst.writemask = WRITEMASK_XY;
   emit(F16TO32(dst, tmp_src));
}

void
vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
{
   /* Instead of splitting the 32-bit integer, shifting, and ORing it back
    * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
    * is not suitable to generate the shift values, but we can use the packed
    * vector float and a type-converting MOV.
    */
   dst_reg shift(this, glsl_type::uvec4_type);
   emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));

   dst_reg shifted(this, glsl_type::uvec4_type);
   src0.swizzle = BRW_SWIZZLE_XXXX;
   emit(SHR(shifted, src0, src_reg(shift)));

   shifted.type = BRW_REGISTER_TYPE_UB;
   dst_reg f(this, glsl_type::vec4_type);
   emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));

   emit(MUL(dst, src_reg(f), brw_imm_f(1.0f / 255.0f)));
}

void
vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
{
   /* Instead of splitting the 32-bit integer, shifting, and ORing it back
    * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
    * is not suitable to generate the shift values, but we can use the packed
    * vector float and a type-converting MOV.
    */
   dst_reg shift(this, glsl_type::uvec4_type);
   emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));

   dst_reg shifted(this, glsl_type::uvec4_type);
   src0.swizzle = BRW_SWIZZLE_XXXX;
   emit(SHR(shifted, src0, src_reg(shift)));

   shifted.type = BRW_REGISTER_TYPE_B;
   dst_reg f(this, glsl_type::vec4_type);
   emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));

   dst_reg scaled(this, glsl_type::vec4_type);
   emit(MUL(scaled, src_reg(f), brw_imm_f(1.0f / 127.0f)));

   dst_reg max(this, glsl_type::vec4_type);
   emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), brw_imm_f(-1.0f));
   emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), brw_imm_f(1.0f));
}

void
vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
{
   dst_reg saturated(this, glsl_type::vec4_type);
   vec4_instruction *inst = emit(MOV(saturated, src0));
   inst->saturate = true;

   dst_reg scaled(this, glsl_type::vec4_type);
   emit(MUL(scaled, src_reg(saturated), brw_imm_f(255.0f)));

   dst_reg rounded(this, glsl_type::vec4_type);
   emit(RNDE(rounded, src_reg(scaled)));

   dst_reg u(this, glsl_type::uvec4_type);
   emit(MOV(u, src_reg(rounded)));

   src_reg bytes(u);
   emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
}

void
vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
{
   dst_reg max(this, glsl_type::vec4_type);
   emit_minmax(BRW_CONDITIONAL_GE, max, src0, brw_imm_f(-1.0f));

   dst_reg min(this, glsl_type::vec4_type);
   emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), brw_imm_f(1.0f));

   dst_reg scaled(this, glsl_type::vec4_type);
   emit(MUL(scaled, src_reg(min), brw_imm_f(127.0f)));

   dst_reg rounded(this, glsl_type::vec4_type);
   emit(RNDE(rounded, src_reg(scaled)));

   dst_reg i(this, glsl_type::ivec4_type);
   emit(MOV(i, src_reg(rounded)));

   src_reg bytes(i);
   emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
}

/*
 * Returns the minimum number of vec4 (as_vec4 == true) or dvec4 (as_vec4 ==
 * false) elements needed to pack a type.
 */
static int
type_size_xvec4(const struct glsl_type *type, bool as_vec4)
{
   unsigned int i;
   int size;

   switch (type->base_type) {
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_BOOL:
   case GLSL_TYPE_DOUBLE:
      if (type->is_matrix()) {
         const glsl_type *col_type = type->column_type();
         unsigned col_slots =
            (as_vec4 && col_type->is_dual_slot()) ? 2 : 1;
         return type->matrix_columns * col_slots;
      } else {
         /* Regardless of size of vector, it gets a vec4. This is bad
          * packing for things like floats, but otherwise arrays become a
          * mess. Hopefully a later pass over the code can pack scalars
          * down if appropriate.
          */
         return (as_vec4 && type->is_dual_slot()) ? 2 : 1;
      }
   case GLSL_TYPE_ARRAY:
      assert(type->length > 0);
      return type_size_xvec4(type->fields.array, as_vec4) * type->length;
   case GLSL_TYPE_STRUCT:
      size = 0;
      for (i = 0; i < type->length; i++) {
         size += type_size_xvec4(type->fields.structure[i].type, as_vec4);
      }
      return size;
   case GLSL_TYPE_SUBROUTINE:
      return 1;

   case GLSL_TYPE_SAMPLER:
      /* Samplers take up no register space, since they're baked in at
       * link time.
       */
      return 0;
   case GLSL_TYPE_ATOMIC_UINT:
      return 0;
   case GLSL_TYPE_IMAGE:
      return DIV_ROUND_UP(BRW_IMAGE_PARAM_SIZE, 4);
   case GLSL_TYPE_VOID:
   case GLSL_TYPE_ERROR:
   case GLSL_TYPE_INTERFACE:
   case GLSL_TYPE_FUNCTION:
      unreachable("not reached");
   }

   return 0;
}

/**
 * Returns the minimum number of vec4 elements needed to pack a type.
 *
 * For simple types, it will return 1 (a single vec4); for matrices, the
 * number of columns; for array and struct, the sum of the vec4_size of
 * each of its elements; and for sampler and atomic, zero.
 *
 * This method is useful to calculate how much register space is needed to
 * store a particular type.
 */
extern "C" int
type_size_vec4(const struct glsl_type *type)
{
   return type_size_xvec4(type, true);
}

/**
 * Returns the minimum number of dvec4 elements needed to pack a type.
 *
 * For simple types, it will return 1 (a single dvec4); for matrices, the
 * number of columns; for array and struct, the sum of the dvec4_size of
 * each of its elements; and for sampler and atomic, zero.
 *
 * This method is useful to calculate how much register space is needed to
 * store a particular type.
 *
 * Measuring double-precision vertex inputs as dvec4 is required because
 * ARB_vertex_attrib_64bit states that these use the same number of locations
 * as the single-precision version. That is, two consecutive dvec4s would be
 * located in locations "x" and "x+1", not "x+2".
 *
 * In order to map vec4/dvec4 vertex inputs into the proper ATTRs,
 * remap_vs_attrs() will take into account both the location and also whether
 * the type fits in one or two vec4 slots.
 */
extern "C" int
type_size_dvec4(const struct glsl_type *type)
{
   return type_size_xvec4(type, false);
}

src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
{
   init();

   this->file = VGRF;
   this->nr = v->alloc.allocate(type_size_vec4(type));

   if (type->is_array() || type->is_record()) {
      this->swizzle = BRW_SWIZZLE_NOOP;
   } else {
      this->swizzle = brw_swizzle_for_size(type->vector_elements);
   }

   this->type = brw_type_for_base_type(type);
}

src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
{
   assert(size > 0);

   init();

   this->file = VGRF;
   this->nr = v->alloc.allocate(type_size_vec4(type) * size);

   this->swizzle = BRW_SWIZZLE_NOOP;

   this->type = brw_type_for_base_type(type);
}

dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
{
   init();

   this->file = VGRF;
   this->nr = v->alloc.allocate(type_size_vec4(type));

   if (type->is_array() || type->is_record()) {
      this->writemask = WRITEMASK_XYZW;
   } else {
      this->writemask = (1 << type->vector_elements) - 1;
   }

   this->type = brw_type_for_base_type(type);
}

vec4_instruction *
vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
                          src_reg src0, src_reg src1)
{
   vec4_instruction *inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
   inst->conditional_mod = conditionalmod;
   return inst;
}

vec4_instruction *
vec4_visitor::emit_lrp(const dst_reg &dst,
                       const src_reg &x, const src_reg &y, const src_reg &a)
{
   if (devinfo->gen >= 6) {
      /* Note that the instruction's argument order is reversed from GLSL
       * and the IR.
       */
      return emit(LRP(dst, fix_3src_operand(a), fix_3src_operand(y),
                      fix_3src_operand(x)));
   } else {
      /* Earlier generations don't support three source operations, so we
       * need to emit x*(1-a) + y*a.
       */
      dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
      dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
      dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
      y_times_a.writemask = dst.writemask;
      one_minus_a.writemask = dst.writemask;
      x_times_one_minus_a.writemask = dst.writemask;

      emit(MUL(y_times_a, y, a));
      emit(ADD(one_minus_a, negate(a), brw_imm_f(1.0f)));
      emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
      return emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
   }
}

/**
 * Emits the instructions needed to perform a pull constant load. before_block
 * and before_inst can be NULL in which case the instruction will be appended
 * to the end of the instruction list.
 */
void
vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
                                          src_reg surf_index,
                                          src_reg offset_reg,
                                          bblock_t *before_block,
                                          vec4_instruction *before_inst)
{
   assert((before_inst == NULL && before_block == NULL) ||
          (before_inst && before_block));

   vec4_instruction *pull;

   if (devinfo->gen >= 9) {
      /* Gen9+ needs a message header in order to use SIMD4x2 mode */
      src_reg header(this, glsl_type::uvec4_type, 2);

      pull = new(mem_ctx)
         vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
                          dst_reg(header));

      if (before_inst)
         emit_before(before_block, before_inst, pull);
      else
         emit(pull);

      dst_reg index_reg = retype(byte_offset(dst_reg(header), REG_SIZE),
                                 offset_reg.type);
      pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);

      if (before_inst)
         emit_before(before_block, before_inst, pull);
      else
         emit(pull);

      pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
                                           dst,
                                           surf_index,
                                           header);
      pull->mlen = 2;
      pull->header_size = 1;
   } else if (devinfo->gen >= 7) {
      dst_reg grf_offset = dst_reg(this, glsl_type::uint_type);

      grf_offset.type = offset_reg.type;

      pull = MOV(grf_offset, offset_reg);

      if (before_inst)
         emit_before(before_block, before_inst, pull);
      else
         emit(pull);

      pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
                                           dst,
                                           surf_index,
                                           src_reg(grf_offset));
      pull->mlen = 1;
   } else {
      pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
                                           dst,
                                           surf_index,
                                           offset_reg);
      pull->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->gen) + 1;
      pull->mlen = 1;
   }

   if (before_inst)
      emit_before(before_block, before_inst, pull);
   else
      emit(pull);
}

src_reg
vec4_visitor::emit_uniformize(const src_reg &src)
{
   const src_reg chan_index(this, glsl_type::uint_type);
   const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type),
                              src.type);

   emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
      ->force_writemask_all = true;
   emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
      ->force_writemask_all = true;

   return src_reg(dst);
}

src_reg
vec4_visitor::emit_mcs_fetch(const glsl_type *coordinate_type,
                             src_reg coordinate, src_reg surface)
{
   vec4_instruction *inst =
      new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
                                    dst_reg(this, glsl_type::uvec4_type));
   inst->base_mrf = 2;
   inst->src[1] = surface;
   inst->src[2] = surface;

   int param_base;

   if (devinfo->gen >= 9) {
      /* Gen9+ needs a message header in order to use SIMD4x2 mode */
      vec4_instruction *header_inst = new(mem_ctx)
         vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
                          dst_reg(MRF, inst->base_mrf));

      emit(header_inst);

      inst->mlen = 2;
      inst->header_size = 1;
      param_base = inst->base_mrf + 1;
   } else {
      inst->mlen = 1;
      param_base = inst->base_mrf;
   }

   /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
   int coord_mask = (1 << coordinate_type->vector_elements) - 1;
   int zero_mask = 0xf & ~coord_mask;

   emit(MOV(dst_reg(MRF, param_base, coordinate_type, coord_mask),
            coordinate));

   emit(MOV(dst_reg(MRF, param_base, coordinate_type, zero_mask),
            brw_imm_d(0)));

   emit(inst);
   return src_reg(inst->dst);
}

bool
vec4_visitor::is_high_sampler(src_reg sampler)
{
   if (devinfo->gen < 8 && !devinfo->is_haswell)
      return false;

   return sampler.file != IMM || sampler.ud >= 16;
}

void
vec4_visitor::emit_texture(ir_texture_opcode op,
                           dst_reg dest,
                           const glsl_type *dest_type,
                           src_reg coordinate,
                           int coord_components,
                           src_reg shadow_comparator,
                           src_reg lod, src_reg lod2,
                           src_reg sample_index,
                           uint32_t constant_offset,
                           src_reg offset_value,
                           src_reg mcs,
                           uint32_t surface,
                           src_reg surface_reg,
                           src_reg sampler_reg)
{
   /* The sampler can only meaningfully compute LOD for fragment shader
    * messages. For all other stages, we change the opcode to TXL and hardcode
    * the LOD to 0.
    *
    * textureQueryLevels() is implemented in terms of TXS so we need to pass a
    * valid LOD argument.
    */
   if (op == ir_tex || op == ir_query_levels) {
      assert(lod.file == BAD_FILE);
      lod = brw_imm_f(0.0f);
   }

   enum opcode opcode;
   switch (op) {
   case ir_tex: opcode = SHADER_OPCODE_TXL; break;
   case ir_txl: opcode = SHADER_OPCODE_TXL; break;
   case ir_txd: opcode = SHADER_OPCODE_TXD; break;
   case ir_txf: opcode = SHADER_OPCODE_TXF; break;
   case ir_txf_ms: opcode = (devinfo->gen >= 9 ? SHADER_OPCODE_TXF_CMS_W :
                             SHADER_OPCODE_TXF_CMS); break;
   case ir_txs: opcode = SHADER_OPCODE_TXS; break;
   case ir_tg4: opcode = offset_value.file != BAD_FILE
                         ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
   case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
   case ir_texture_samples: opcode = SHADER_OPCODE_SAMPLEINFO; break;
   case ir_txb:
      unreachable("TXB is not valid for vertex shaders.");
   case ir_lod:
      unreachable("LOD is not valid for vertex shaders.");
   case ir_samples_identical: {
      /* There are some challenges implementing this for vec4, and it seems
       * unlikely to be used anyway. For now, just return false always.
       */
      emit(MOV(dest, brw_imm_ud(0u)));
      return;
   }
   default:
      unreachable("Unrecognized tex op");
   }

   vec4_instruction *inst = new(mem_ctx) vec4_instruction(opcode, dest);

   inst->offset = constant_offset;

   /* The message header is necessary for:
    * - Gen4 (always)
    * - Gen9+ for selecting SIMD4x2
    * - Texel offsets
    * - Gather channel selection
    * - Sampler indices too large to fit in a 4-bit value.
    * - Sampleinfo message - takes no parameters, but mlen = 0 is illegal
    */
   inst->header_size =
      (devinfo->gen < 5 || devinfo->gen >= 9 ||
       inst->offset != 0 || op == ir_tg4 ||
       op == ir_texture_samples ||
       is_high_sampler(sampler_reg)) ? 1 : 0;
   inst->base_mrf = 2;
   inst->mlen = inst->header_size;
   inst->dst.writemask = WRITEMASK_XYZW;
   inst->shadow_compare = shadow_comparator.file != BAD_FILE;

   inst->src[1] = surface_reg;
   inst->src[2] = sampler_reg;

   /* MRF for the first parameter */
   int param_base = inst->base_mrf + inst->header_size;

   if (op == ir_txs || op == ir_query_levels) {
      int writemask = devinfo->gen == 4 ?
         WRITEMASK_W : WRITEMASK_X;
      emit(MOV(dst_reg(MRF, param_base, lod.type, writemask), lod));
      inst->mlen++;
   } else if (op == ir_texture_samples) {
      inst->dst.writemask = WRITEMASK_X;
   } else {
      /* Load the coordinate */
      /* FINISHME: gl_clamp_mask and saturate */
      int coord_mask = (1 << coord_components) - 1;
      int zero_mask = 0xf & ~coord_mask;

      emit(MOV(dst_reg(MRF, param_base, coordinate.type, coord_mask),
               coordinate));
      inst->mlen++;

      if (zero_mask != 0) {
         emit(MOV(dst_reg(MRF, param_base, coordinate.type, zero_mask),
                  brw_imm_d(0)));
      }
      /* Load the shadow comparator */
      if (shadow_comparator.file != BAD_FILE && op != ir_txd &&
          (op != ir_tg4 || offset_value.file == BAD_FILE)) {
         emit(MOV(dst_reg(MRF, param_base + 1, shadow_comparator.type,
                          WRITEMASK_X),
                  shadow_comparator));
         inst->mlen++;
      }

      /* Load the LOD info */
      if (op == ir_tex || op == ir_txl) {
         int mrf, writemask;
         if (devinfo->gen >= 5) {
            mrf = param_base + 1;
            if (shadow_comparator.file != BAD_FILE) {
               writemask = WRITEMASK_Y;
               /* mlen already incremented */
            } else {
               writemask = WRITEMASK_X;
               inst->mlen++;
            }
         } else /* devinfo->gen == 4 */ {
            mrf = param_base;
            writemask = WRITEMASK_W;
         }
         emit(MOV(dst_reg(MRF, mrf, lod.type, writemask), lod));
      } else if (op == ir_txf) {
         emit(MOV(dst_reg(MRF, param_base, lod.type, WRITEMASK_W), lod));
      } else if (op == ir_txf_ms) {
         emit(MOV(dst_reg(MRF, param_base + 1, sample_index.type, WRITEMASK_X),
                  sample_index));
         if (opcode == SHADER_OPCODE_TXF_CMS_W) {
            /* MCS data is stored in the first two channels of 'mcs', but we
             * need to get it into the .y and .z channels of the second vec4
             * of params.
             */
            mcs.swizzle = BRW_SWIZZLE4(0, 0, 1, 1);
            emit(MOV(dst_reg(MRF, param_base + 1,
                             glsl_type::uint_type, WRITEMASK_YZ),
                     mcs));
         } else if (devinfo->gen >= 7) {
            /* MCS data is in the first channel of 'mcs', but we need to get it
             * into the .y channel of the second vec4 of params, so replicate .x
             * across the whole vec4 and then mask off everything except .y
             */
            mcs.swizzle = BRW_SWIZZLE_XXXX;
            emit(MOV(dst_reg(MRF, param_base + 1,
                             glsl_type::uint_type, WRITEMASK_Y),
                     mcs));
         }
         inst->mlen++;
      } else if (op == ir_txd) {
         const brw_reg_type type = lod.type;

         if (devinfo->gen >= 5) {
            lod.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
            lod2.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
            emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), lod));
            emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), lod2));
            inst->mlen++;

            if (dest_type->vector_elements == 3 ||
                shadow_comparator.file != BAD_FILE) {
               lod.swizzle = BRW_SWIZZLE_ZZZZ;
               lod2.swizzle = BRW_SWIZZLE_ZZZZ;
               emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), lod));
               emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), lod2));
               inst->mlen++;

               if (shadow_comparator.file != BAD_FILE) {
                  emit(MOV(dst_reg(MRF, param_base + 2,
                                   shadow_comparator.type, WRITEMASK_Z),
                           shadow_comparator));
               }
            }
         } else /* devinfo->gen == 4 */ {
            emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), lod));
            emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), lod2));
            inst->mlen += 2;
         }
      } else if (op == ir_tg4 && offset_value.file != BAD_FILE) {
         if (shadow_comparator.file != BAD_FILE) {
            emit(MOV(dst_reg(MRF, param_base, shadow_comparator.type,
                             WRITEMASK_W),
                     shadow_comparator));
         }

         emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type,
                          WRITEMASK_XY),
                  offset_value));
         inst->mlen++;
      }
   }

   emit(inst);

   /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
    * spec requires layers.
    */
   if (op == ir_txs && devinfo->gen < 7) {
      /* Gen4-6 return 0 instead of 1 for single layer surfaces. */
      emit_minmax(BRW_CONDITIONAL_GE, writemask(inst->dst, WRITEMASK_Z),
                  src_reg(inst->dst), brw_imm_d(1));
   }

   if (devinfo->gen == 6 && op == ir_tg4) {
      emit_gen6_gather_wa(key_tex->gen6_gather_wa[surface], inst->dst);
   }

   if (op == ir_query_levels) {
      /* # levels is in .w */
      src_reg swizzled(dest);
      swizzled.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W,
                                      SWIZZLE_W, SWIZZLE_W);
      emit(MOV(dest, swizzled));
   }
}

/**
 * Apply workarounds for Gen6 gather with UINT/SINT
 */
void
vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
{
   if (!wa)
      return;

   int width = (wa & WA_8BIT) ? 8 : 16;
   dst_reg dst_f = dst;
   dst_f.type = BRW_REGISTER_TYPE_F;

   /* Convert from UNORM to UINT */
   emit(MUL(dst_f, src_reg(dst_f), brw_imm_f((float)((1 << width) - 1))));
   emit(MOV(dst, src_reg(dst_f)));

   if (wa & WA_SIGN) {
      /* Reinterpret the UINT value as a signed INT value by
       * shifting the sign bit into place, then shifting back
       * preserving sign.
       */
      emit(SHL(dst, src_reg(dst), brw_imm_d(32 - width)));
      emit(ASR(dst, src_reg(dst), brw_imm_d(32 - width)));
   }
}

void
vec4_visitor::gs_emit_vertex(int /* stream_id */)
{
   unreachable("not reached");
}

void
vec4_visitor::gs_end_primitive()
{
   unreachable("not reached");
}

void
vec4_visitor::emit_ndc_computation()
{
   if (output_reg[VARYING_SLOT_POS][0].file == BAD_FILE)
      return;

   /* Get the position */
   src_reg pos = src_reg(output_reg[VARYING_SLOT_POS][0]);

   /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
   dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
   output_reg[BRW_VARYING_SLOT_NDC][0] = ndc;
   output_num_components[BRW_VARYING_SLOT_NDC][0] = 4;

   current_annotation = "NDC";
   dst_reg ndc_w = ndc;
   ndc_w.writemask = WRITEMASK_W;
   src_reg pos_w = pos;
   pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
   emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);

   dst_reg ndc_xyz = ndc;
   ndc_xyz.writemask = WRITEMASK_XYZ;

   emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
}

void
vec4_visitor::emit_psiz_and_flags(dst_reg reg)
{
   if (devinfo->gen < 6 &&
       ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
        output_reg[VARYING_SLOT_CLIP_DIST0][0].file != BAD_FILE ||
        devinfo->has_negative_rhw_bug)) {
      dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
      dst_reg header1_w = header1;
      header1_w.writemask = WRITEMASK_W;

      emit(MOV(header1, brw_imm_ud(0u)));

      if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
         src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ][0]);

         current_annotation = "Point size";
         emit(MUL(header1_w, psiz, brw_imm_f((float)(1 << 11))));
         emit(AND(header1_w, src_reg(header1_w), brw_imm_d(0x7ff << 8)));
      }

      if (output_reg[VARYING_SLOT_CLIP_DIST0][0].file != BAD_FILE) {
         current_annotation = "Clipping flags";
         dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
         dst_reg flags1 = dst_reg(this, glsl_type::uint_type);

         emit(CMP(dst_null_f(),
                  src_reg(output_reg[VARYING_SLOT_CLIP_DIST0][0]),
                  brw_imm_f(0.0f), BRW_CONDITIONAL_L));
         emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, brw_imm_d(0));
         emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));

         emit(CMP(dst_null_f(),
                  src_reg(output_reg[VARYING_SLOT_CLIP_DIST1][0]),
                  brw_imm_f(0.0f), BRW_CONDITIONAL_L));
         emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, brw_imm_d(0));
         emit(SHL(flags1, src_reg(flags1), brw_imm_d(4)));
         emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
      }

      /* i965 clipping workaround:
       * 1) Test for -ve rhw
       * 2) If set,
       *      set ndc = (0,0,0,0)
       *      set ucp[6] = 1
       *
       * Later, clipping will detect ucp[6] and ensure the primitive is
       * clipped against all fixed planes.
       */
      if (devinfo->has_negative_rhw_bug &&
          output_reg[BRW_VARYING_SLOT_NDC][0].file != BAD_FILE) {
         src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC][0]);
         ndc_w.swizzle = BRW_SWIZZLE_WWWW;
         emit(CMP(dst_null_f(), ndc_w, brw_imm_f(0.0f), BRW_CONDITIONAL_L));
         vec4_instruction *inst;
         inst = emit(OR(header1_w, src_reg(header1_w), brw_imm_ud(1u << 6)));
         inst->predicate = BRW_PREDICATE_NORMAL;
         output_reg[BRW_VARYING_SLOT_NDC][0].type = BRW_REGISTER_TYPE_F;
         inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC][0], brw_imm_f(0.0f)));
         inst->predicate = BRW_PREDICATE_NORMAL;
      }

      emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
   } else if (devinfo->gen < 6) {
      emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u)));
   } else {
      emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), brw_imm_d(0)));
      if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
         dst_reg reg_w = reg;
         reg_w.writemask = WRITEMASK_W;
         src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ][0]);
         reg_as_src.type = reg_w.type;
         reg_as_src.swizzle = brw_swizzle_for_size(1);
         emit(MOV(reg_w, reg_as_src));
      }
      if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
         dst_reg reg_y = reg;
         reg_y.writemask = WRITEMASK_Y;
         reg_y.type = BRW_REGISTER_TYPE_D;
         output_reg[VARYING_SLOT_LAYER][0].type = reg_y.type;
         emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER][0])));
      }
      if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
         dst_reg reg_z = reg;
         reg_z.writemask = WRITEMASK_Z;
         reg_z.type = BRW_REGISTER_TYPE_D;
         output_reg[VARYING_SLOT_VIEWPORT][0].type = reg_z.type;
         emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT][0])));
      }
   }
}

vec4_instruction *
vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying, int component)
{
   assert(varying < VARYING_SLOT_MAX);

   unsigned num_comps = output_num_components[varying][component];
   if (num_comps == 0)
      return NULL;

   assert(output_reg[varying][component].type == reg.type);
   current_annotation = output_reg_annotation[varying];
   if (output_reg[varying][component].file != BAD_FILE) {
      src_reg src = src_reg(output_reg[varying][component]);
      src.swizzle = BRW_SWZ_COMP_OUTPUT(component);
      reg.writemask =
         brw_writemask_for_component_packing(num_comps, component);
      return emit(MOV(reg, src));
   }
   return NULL;
}

void
vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
{
   reg.type = BRW_REGISTER_TYPE_F;
   output_reg[varying][0].type = reg.type;

   switch (varying) {
   case VARYING_SLOT_PSIZ:
   {
      /* PSIZ is always in slot 0, and is coupled with other flags. */
      current_annotation = "indices, point width, clip flags";
      emit_psiz_and_flags(reg);
      break;
   }
   case BRW_VARYING_SLOT_NDC:
      current_annotation = "NDC";
      if (output_reg[BRW_VARYING_SLOT_NDC][0].file != BAD_FILE)
         emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC][0])));
      break;
   case VARYING_SLOT_POS:
      current_annotation = "gl_Position";
      if (output_reg[VARYING_SLOT_POS][0].file != BAD_FILE)
         emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS][0])));
      break;
   case VARYING_SLOT_EDGE:
      /* This is present when doing unfilled polygons. We're supposed to copy
       * the edge flag from the user-provided vertex array
       * (glEdgeFlagPointer), or otherwise we'll copy from the current value
       * of that attribute (starts as 1.0f). This is then used in clipping to
       * determine which edges should be drawn as wireframe.
       */
      current_annotation = "edge flag";
      emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
                                    glsl_type::float_type, WRITEMASK_XYZW))));
      break;
   case BRW_VARYING_SLOT_PAD:
      /* No need to write to this slot */
      break;
   default:
      for (int i = 0; i < 4; i++) {
         emit_generic_urb_slot(reg, varying, i);
      }
      break;
   }
}

static int
align_interleaved_urb_mlen(const struct gen_device_info *devinfo, int mlen)
{
   if (devinfo->gen >= 6) {
      /* URB data written (does not include the message header reg) must
       * be a multiple of 256 bits, or 2 VS registers.  See vol5c.5,
       * section 5.4.3.2.2: URB_INTERLEAVED.
       *
       * URB entries are allocated on a multiple of 1024 bits, so an
       * extra 128 bits written here to make the end align to 256 is
       * no problem.
       */
      if ((mlen % 2) != 1)
         mlen++;
   }

   return mlen;
}


/**
 * Generates the VUE payload plus the necessary URB write instructions to
 * output it.
 *
 * The VUE layout is documented in Volume 2a.
 */
void
vec4_visitor::emit_vertex()
{
   /* MRF 0 is reserved for the debugger, so start with message header
    * in MRF 1.
    */
   int base_mrf = 1;
   int mrf = base_mrf;
   /* In the process of generating our URB write message contents, we
    * may need to unspill a register or load from an array.  Those
    * reads would use MRFs 14-15.
    */
   int max_usable_mrf = FIRST_SPILL_MRF(devinfo->gen);

   /* The following assertion verifies that max_usable_mrf causes an
    * even-numbered amount of URB write data, which will meet gen6's
    * requirements for length alignment.
    */
   assert((max_usable_mrf - base_mrf) % 2 == 0);

   /* First mrf is the g0-based message header containing URB handles and
    * such.
    */
   emit_urb_write_header(mrf++);

   if (devinfo->gen < 6) {
      emit_ndc_computation();
   }

   /* We may need to split this up into several URB writes, so do them in a
    * loop.
    */
   int slot = 0;
   bool complete = false;
   do {
      /* URB offset is in URB row increments, and each of our MRFs is half of
       * one of those, since we're doing interleaved writes.
       */
      int offset = slot / 2;

      mrf = base_mrf + 1;
      for (; slot < prog_data->vue_map.num_slots; ++slot) {
         emit_urb_slot(dst_reg(MRF, mrf++),
                       prog_data->vue_map.slot_to_varying[slot]);

         /* If this was max_usable_mrf, we can't fit anything more into this
          * URB WRITE. Same thing if we reached the maximum length available.
          */
         if (mrf > max_usable_mrf ||
             align_interleaved_urb_mlen(devinfo, mrf - base_mrf + 1) >
                BRW_MAX_MSG_LENGTH) {
            slot++;
            break;
         }
      }

      complete = slot >= prog_data->vue_map.num_slots;
      current_annotation = "URB write";
      vec4_instruction *inst = emit_urb_write_opcode(complete);
      inst->base_mrf = base_mrf;
      inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
      inst->offset += offset;
   } while (!complete);
}


src_reg
vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
                                 src_reg *reladdr, int reg_offset)
{
   /* Because we store the values to scratch interleaved like our
    * vertex data, we need to scale the vec4 index by 2.
    */
   int message_header_scale = 2;

   /* Pre-gen6, the message header uses byte offsets instead of vec4
    * (16-byte) offset units.
    */
   if (devinfo->gen < 6)
      message_header_scale *= 16;

   if (reladdr) {
      /* A vec4 is 16 bytes and a dvec4 is 32 bytes so for doubles we have
       * to multiply the reladdr by 2. Notice that the reg_offset part
       * is in units of 16 bytes and is used to select the low/high 16-byte
       * chunk of a full dvec4, so we don't want to multiply that part.
       */
      src_reg index = src_reg(this, glsl_type::int_type);
      if (type_sz(inst->dst.type) < 8) {
         emit_before(block, inst, ADD(dst_reg(index), *reladdr,
                                      brw_imm_d(reg_offset)));
         emit_before(block, inst, MUL(dst_reg(index), index,
                                      brw_imm_d(message_header_scale)));
      } else {
         emit_before(block, inst, MUL(dst_reg(index), *reladdr,
                                      brw_imm_d(message_header_scale * 2)));
         emit_before(block, inst, ADD(dst_reg(index), index,
                                      brw_imm_d(reg_offset *
                                                message_header_scale)));
      }
      return index;
   } else {
      return brw_imm_d(reg_offset * message_header_scale);
   }
}

/**
 * Emits an instruction before @inst to load the value named by @orig_src
 * from scratch space at @base_offset to @temp.
 *
 * @base_offset is measured in 32-byte units (the size of a register).
 */
void
vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
                                dst_reg temp, src_reg orig_src,
                                int base_offset)
{
   assert(orig_src.offset % REG_SIZE == 0);
   int reg_offset = base_offset + orig_src.offset / REG_SIZE;
   src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
                                      reg_offset);

   if (type_sz(orig_src.type) < 8) {
      emit_before(block, inst, SCRATCH_READ(temp, index));
   } else {
      dst_reg shuffled = dst_reg(this, glsl_type::dvec4_type);
      dst_reg shuffled_float = retype(shuffled, BRW_REGISTER_TYPE_F);
      emit_before(block, inst, SCRATCH_READ(shuffled_float, index));
      index = get_scratch_offset(block, inst, orig_src.reladdr, reg_offset + 1);
      vec4_instruction *last_read =
         SCRATCH_READ(byte_offset(shuffled_float, REG_SIZE), index);
      emit_before(block, inst, last_read);
      shuffle_64bit_data(temp, src_reg(shuffled), false, block, last_read);
   }
}

/**
 * Emits an instruction after @inst to store the value to be written
 * to @orig_dst to scratch space at @base_offset, from @temp.
 *
 * @base_offset is measured in 32-byte units (the size of a register).
 */
void
vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
                                 int base_offset)
{
   assert(inst->dst.offset % REG_SIZE == 0);
   int reg_offset = base_offset + inst->dst.offset / REG_SIZE;
   src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
                                      reg_offset);

   /* Create a temporary register to store *inst's result in.
    *
    * We have to be careful in MOVing from our temporary result register in
    * the scratch write. If we swizzle from channels of the temporary that
    * weren't initialized, it will confuse live interval analysis, which will
    * make spilling fail to make progress.
    */
   bool is_64bit = type_sz(inst->dst.type) == 8;
   const glsl_type *alloc_type =
      is_64bit ? glsl_type::dvec4_type : glsl_type::vec4_type;
   const src_reg temp = swizzle(retype(src_reg(this, alloc_type),
                                       inst->dst.type),
                                brw_swizzle_for_mask(inst->dst.writemask));

   if (!is_64bit) {
      dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
                                          inst->dst.writemask));
      vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
      if (inst->opcode != BRW_OPCODE_SEL)
         write->predicate = inst->predicate;
      write->ir = inst->ir;
      write->annotation = inst->annotation;
      inst->insert_after(block, write);
   } else {
      dst_reg shuffled = dst_reg(this, alloc_type);
      vec4_instruction *last =
         shuffle_64bit_data(shuffled, temp, true, block, inst);
      src_reg shuffled_float = src_reg(retype(shuffled, BRW_REGISTER_TYPE_F));

      uint8_t mask = 0;
      if (inst->dst.writemask & WRITEMASK_X)
         mask |= WRITEMASK_XY;
      if (inst->dst.writemask & WRITEMASK_Y)
         mask |= WRITEMASK_ZW;
      if (mask) {
         dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0), mask));

         vec4_instruction *write = SCRATCH_WRITE(dst, shuffled_float, index);
         if (inst->opcode != BRW_OPCODE_SEL)
            write->predicate = inst->predicate;
         write->ir = inst->ir;
         write->annotation = inst->annotation;
         last->insert_after(block, write);
      }

      mask = 0;
      if (inst->dst.writemask & WRITEMASK_Z)
         mask |= WRITEMASK_XY;
      if (inst->dst.writemask & WRITEMASK_W)
         mask |= WRITEMASK_ZW;
      if (mask) {
         dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0), mask));

         src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
                                            reg_offset + 1);
         vec4_instruction *write =
            SCRATCH_WRITE(dst, byte_offset(shuffled_float, REG_SIZE), index);
         if (inst->opcode != BRW_OPCODE_SEL)
            write->predicate = inst->predicate;
         write->ir = inst->ir;
         write->annotation = inst->annotation;
         last->insert_after(block, write);
      }
   }

   inst->dst.file = temp.file;
   inst->dst.nr = temp.nr;
   inst->dst.offset %= REG_SIZE;
   inst->dst.reladdr = NULL;
}

/**
 * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
 * adds the scratch read(s) before \p inst. The function also checks for
 * recursive reladdr scratch accesses, issuing the corresponding scratch
 * loads and rewriting reladdr references accordingly.
 *
 * \return \p src if it did not require a scratch load, otherwise, the
 * register holding the result of the scratch load that the caller should
 * use to rewrite src.
 */
src_reg
vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
                                   vec4_instruction *inst, src_reg src)
{
   /* Resolve recursive reladdr scratch access by calling ourselves
    * with src.reladdr
    */
   if (src.reladdr)
      *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
                                          *src.reladdr);

   /* Now handle scratch access on src */
   if (src.file == VGRF && scratch_loc[src.nr] != -1) {
      dst_reg temp = dst_reg(this, type_sz(src.type) == 8 ?
                             glsl_type::dvec4_type : glsl_type::vec4_type);
      emit_scratch_read(block, inst, temp, src, scratch_loc[src.nr]);
      src.nr = temp.nr;
      src.offset %= REG_SIZE;
      src.reladdr = NULL;
   }

   return src;
}

/**
 * We can't generally support array access in GRF space, because a
 * single instruction's destination can only span 2 contiguous
 * registers.  So, we send all GRF arrays that get variable index
 * access to scratch space.
 */
void
vec4_visitor::move_grf_array_access_to_scratch()
{
   int scratch_loc[this->alloc.count];
   memset(scratch_loc, -1, sizeof(scratch_loc));

   /* First, calculate the set of virtual GRFs that need to be punted
    * to scratch due to having any array access on them, and where in
    * scratch.
    */
   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      if (inst->dst.file == VGRF && inst->dst.reladdr) {
         if (scratch_loc[inst->dst.nr] == -1) {
            scratch_loc[inst->dst.nr] = last_scratch;
            last_scratch += this->alloc.sizes[inst->dst.nr];
         }

         for (src_reg *iter = inst->dst.reladdr;
              iter->reladdr;
              iter = iter->reladdr) {
            if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
               scratch_loc[iter->nr] = last_scratch;
               last_scratch += this->alloc.sizes[iter->nr];
            }
         }
      }

      for (int i = 0; i < 3; i++) {
         for (src_reg *iter = &inst->src[i];
              iter->reladdr;
              iter = iter->reladdr) {
            if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
               scratch_loc[iter->nr] = last_scratch;
               last_scratch += this->alloc.sizes[iter->nr];
            }
         }
      }
   }

   /* Now, for anything that will be accessed through scratch, rewrite
    * it to load/store.  Note that this is a _safe list walk, because
    * we may generate a new scratch_write instruction after the one
    * we're processing.
    */
   foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
      /* Set up the annotation tracking for new generated instructions. */
      base_ir = inst->ir;
      current_annotation = inst->annotation;

      /* First handle scratch access on the dst. Notice we have to handle
       * the case where the dst's reladdr also points to scratch space.
       */
      if (inst->dst.reladdr)
         *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
                                                   *inst->dst.reladdr);

      /* Now that we have handled any (possibly recursive) reladdr scratch
       * accesses for dst we can safely do the scratch write for dst itself
       */
      if (inst->dst.file == VGRF && scratch_loc[inst->dst.nr] != -1)
         emit_scratch_write(block, inst, scratch_loc[inst->dst.nr]);

      /* Now handle scratch access on any src. In this case, since inst->src[i]
       * already is a src_reg, we can just call emit_resolve_reladdr with
       * inst->src[i] and it will take care of handling scratch loads for
       * both src and src.reladdr (recursively).
       */
      for (int i = 0; i < 3; i++) {
         inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
                                             inst->src[i]);
      }
   }
}

/**
 * Emits an instruction before @inst to load the value named by @orig_src
 * from the pull constant buffer (surface) at @base_offset to @temp.
 */
void
vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
                                      dst_reg temp, src_reg orig_src,
                                      int base_offset, src_reg indirect)
{
   assert(orig_src.offset % 16 == 0);
   const unsigned index = prog_data->base.binding_table.pull_constants_start;

   /* For 64bit loads we need to emit two 32-bit load messages and we also
    * need to shuffle the 32-bit data result into proper 64-bit data. To do
    * that we emit the 32-bit loads into a temporary and we shuffle the result
    * into the original destination.
    */
   dst_reg orig_temp = temp;
   bool is_64bit = type_sz(orig_src.type) == 8;
   if (is_64bit) {
      assert(type_sz(temp.type) == 8);
      dst_reg temp_df = dst_reg(this, glsl_type::dvec4_type);
      temp = retype(temp_df, BRW_REGISTER_TYPE_F);
   }

   src_reg src = orig_src;
   for (int i = 0; i < (is_64bit ? 2 : 1); i++) {
      int reg_offset = base_offset + src.offset / 16;

      src_reg offset;
      if (indirect.file != BAD_FILE) {
         offset = src_reg(this, glsl_type::uint_type);
         emit_before(block, inst, ADD(dst_reg(offset), indirect,
                                      brw_imm_ud(reg_offset * 16)));
      } else if (devinfo->gen >= 8) {
         /* Store the offset in a GRF so we can send-from-GRF. */
         offset = src_reg(this, glsl_type::uint_type);
         emit_before(block, inst, MOV(dst_reg(offset),
                                      brw_imm_ud(reg_offset * 16)));
      } else {
         offset = brw_imm_d(reg_offset * 16);
      }

      emit_pull_constant_load_reg(byte_offset(temp, i * REG_SIZE),
                                  brw_imm_ud(index),
                                  offset,
                                  block, inst);

      src = byte_offset(src, 16);
   }

   brw_mark_surface_used(&prog_data->base, index);

   if (is_64bit) {
      temp = retype(temp, BRW_REGISTER_TYPE_DF);
      shuffle_64bit_data(orig_temp, src_reg(temp), false, block, inst);
   }
}

/**
 * Implements array access of uniforms by inserting a
 * PULL_CONSTANT_LOAD instruction.
 *
 * Unlike temporary GRF array access (where we don't support it due to
 * the difficulty of doing relative addressing on instruction
 * destinations), we could potentially do array access of uniforms
 * that were loaded in GRF space as push constants.  In real-world
 * usage we've seen, though, the arrays being used are always larger
 * than we could load as push constants, so just always move all
 * uniform array access out to a pull constant buffer.
 */
void
vec4_visitor::move_uniform_array_access_to_pull_constants()
{
   /* The Vulkan driver doesn't support pull constants other than UBOs, so
    * everything has to be pushed regardless.
    */
   if (stage_prog_data->pull_param == NULL) {
      split_uniform_registers();
      return;
   }

   int pull_constant_loc[this->uniforms];
   memset(pull_constant_loc, -1, sizeof(pull_constant_loc));

   /* First, walk through the instructions and determine which things need to
    * be pulled.  We mark something as needing to be pulled by setting
    * pull_constant_loc to 0.
    */
   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      /* We only care about MOV_INDIRECT of a uniform */
      if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
          inst->src[0].file != UNIFORM)
         continue;

      int uniform_nr = inst->src[0].nr + inst->src[0].offset / 16;

      for (unsigned j = 0; j < DIV_ROUND_UP(inst->src[2].ud, 16); j++)
         pull_constant_loc[uniform_nr + j] = 0;
   }

   /* Next, we walk the list of uniforms and assign real pull constant
    * locations and set their corresponding entries in pull_param.
    */
   for (int j = 0; j < this->uniforms; j++) {
      if (pull_constant_loc[j] < 0)
         continue;

      pull_constant_loc[j] = stage_prog_data->nr_pull_params / 4;

      for (int i = 0; i < 4; i++) {
         stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
            = stage_prog_data->param[j * 4 + i];
      }
   }

   /* Finally, we can walk through the instructions and lower MOV_INDIRECT
    * instructions to actual uniform pulls.
    */
   foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
      /* We only care about MOV_INDIRECT of a uniform */
      if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
          inst->src[0].file != UNIFORM)
         continue;

      int uniform_nr = inst->src[0].nr + inst->src[0].offset / 16;

      assert(inst->src[0].swizzle == BRW_SWIZZLE_NOOP);

      emit_pull_constant_load(block, inst, inst->dst, inst->src[0],
                              pull_constant_loc[uniform_nr], inst->src[1]);
      inst->remove(block);
   }

   /* Now there are no accesses of the UNIFORM file with a reladdr, so
    * no need to track them as larger-than-vec4 objects.  This will be
    * relied on in cutting out unused uniform vectors from push
    * constants.
    */
   split_uniform_registers();
}

void
vec4_visitor::resolve_ud_negate(src_reg *reg)
{
   if (reg->type != BRW_REGISTER_TYPE_UD ||
       !reg->negate)
      return;

   src_reg temp = src_reg(this, glsl_type::uvec4_type);
   emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
   *reg = temp;
}

vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
                           void *log_data,
                           const struct brw_sampler_prog_key_data *key_tex,
                           struct brw_vue_prog_data *prog_data,
                           const nir_shader *shader,
                           void *mem_ctx,
                           bool no_spills,
                           int shader_time_index)
   : backend_shader(compiler, log_data, mem_ctx, shader, &prog_data->base),
     key_tex(key_tex),
     prog_data(prog_data),
     fail_msg(NULL),
     first_non_payload_grf(0),
     need_all_constants_in_pull_buffer(false),
     no_spills(no_spills),
     shader_time_index(shader_time_index),
     last_scratch(0)
{
   this->failed = false;

   this->base_ir = NULL;
   this->current_annotation = NULL;
   memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));

   memset(this->output_num_components, 0, sizeof(this->output_num_components));

   this->virtual_grf_start = NULL;
   this->virtual_grf_end = NULL;
   this->live_intervals = NULL;

   this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;

   this->uniforms = 0;
}

vec4_visitor::~vec4_visitor()
{
}


void
vec4_visitor::fail(const char *format, ...)
{
   va_list va;
   char *msg;

   if (failed)
      return;

   failed = true;

   va_start(va, format);
   msg = ralloc_vasprintf(mem_ctx, format, va);
   va_end(va);
   msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);

   this->fail_msg = msg;

   if (debug_enabled) {
      fprintf(stderr, "%s", msg);
   }
}

} /* namespace brw */