1/* 2 * Copyright 2011 Christoph Bumiller 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice shall be included in 12 * all copies or substantial portions of the Software. 13 * 14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 18 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF 19 * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 20 * SOFTWARE. 
21 */ 22 23#include "nv50/codegen/nv50_ir.h" 24#include "nv50/codegen/nv50_ir_build_util.h" 25 26#include "nv50_ir_target_nv50.h" 27 28namespace nv50_ir { 29 30// nv50 doesn't support 32 bit integer multiplication 31// 32// ah al * bh bl = LO32: (al * bh + ah * bl) << 16 + (al * bl) 33// ------------------- 34// al*bh 00 HI32: (al * bh + ah * bl) >> 16 + (ah * bh) + 35// ah*bh 00 00 ( carry1) << 16 + ( carry2) 36// al*bl 37// ah*bl 00 38// 39// fffe0001 + fffe0001 40static bool 41expandIntegerMUL(BuildUtil *bld, Instruction *mul) 42{ 43 const bool highResult = mul->subOp == NV50_IR_SUBOP_MUL_HIGH; 44 45 DataType fTy = mul->sType; // full type 46 DataType hTy; 47 switch (fTy) { 48 case TYPE_S32: hTy = TYPE_S16; break; 49 case TYPE_U32: hTy = TYPE_U16; break; 50 case TYPE_U64: hTy = TYPE_U32; break; 51 case TYPE_S64: hTy = TYPE_S32; break; 52 default: 53 return false; 54 } 55 unsigned int fullSize = typeSizeof(fTy); 56 unsigned int halfSize = typeSizeof(hTy); 57 58 Instruction *i[9]; 59 60 bld->setPosition(mul, true); 61 62 Value *a[2], *b[2]; 63 Value *c[2]; 64 Value *t[4]; 65 for (int j = 0; j < 4; ++j) 66 t[j] = bld->getSSA(fullSize); 67 68 // split sources into halves 69 i[0] = bld->mkSplit(a, halfSize, mul->getSrc(0)); 70 i[1] = bld->mkSplit(b, halfSize, mul->getSrc(1)); 71 72 i[2] = bld->mkOp2(OP_MUL, fTy, t[0], a[0], b[1]); 73 i[3] = bld->mkOp3(OP_MAD, fTy, t[1], a[1], b[0], t[0]); 74 i[7] = bld->mkOp2(OP_SHL, fTy, t[2], t[1], bld->mkImm(halfSize * 8)); 75 i[4] = bld->mkOp3(OP_MAD, fTy, t[3], a[0], b[0], t[2]); 76 77 if (highResult) { 78 Value *r[3]; 79 Value *imm = bld->loadImm(NULL, 1 << (halfSize * 8)); 80 c[0] = bld->getSSA(1, FILE_FLAGS); 81 c[1] = bld->getSSA(1, FILE_FLAGS); 82 for (int j = 0; j < 3; ++j) 83 r[j] = bld->getSSA(fullSize); 84 85 i[8] = bld->mkOp2(OP_SHR, fTy, r[0], t[1], bld->mkImm(halfSize * 8)); 86 i[6] = bld->mkOp2(OP_ADD, fTy, r[1], r[0], imm); 87 bld->mkOp2(OP_UNION, TYPE_U32, r[2], r[1], r[0]); 88 i[5] = bld->mkOp3(OP_MAD, fTy, 
mul->getDef(0), a[1], b[1], r[2]); 89 90 // set carry defs / sources 91 i[3]->setFlagsDef(1, c[0]); 92 i[4]->setFlagsDef(0, c[1]); // actual result not required, just the carry 93 i[6]->setPredicate(CC_C, c[0]); 94 i[5]->setFlagsSrc(3, c[1]); 95 } else { 96 bld->mkMov(mul->getDef(0), t[3]); 97 } 98 delete_Instruction(bld->getProgram(), mul); 99 100 for (int j = 2; j <= (highResult ? 5 : 4); ++j) 101 if (i[j]) 102 i[j]->sType = hTy; 103 104 return true; 105} 106 107#define QOP_ADD 0 108#define QOP_SUBR 1 109#define QOP_SUB 2 110#define QOP_MOV2 3 111 112// UL UR LL LR 113#define QUADOP(q, r, s, t) \ 114 ((QOP_##q << 6) | (QOP_##r << 4) | \ 115 (QOP_##s << 2) | (QOP_##t << 0)) 116 117class NV50LegalizePostRA : public Pass 118{ 119private: 120 virtual bool visit(Function *); 121 virtual bool visit(BasicBlock *); 122 123 void handlePRERET(FlowInstruction *); 124 void replaceZero(Instruction *); 125 void split64BitOp(Instruction *); 126 127 LValue *r63; 128}; 129 130bool 131NV50LegalizePostRA::visit(Function *fn) 132{ 133 Program *prog = fn->getProgram(); 134 135 r63 = new_LValue(fn, FILE_GPR); 136 r63->reg.data.id = 63; 137 138 // this is actually per-program, but we can do it all on visiting main() 139 std::list<Instruction *> *outWrites = 140 reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv); 141 142 if (outWrites) { 143 for (std::list<Instruction *>::iterator it = outWrites->begin(); 144 it != outWrites->end(); ++it) 145 (*it)->getSrc(1)->defs.front()->getInsn()->setDef(0, (*it)->getSrc(0)); 146 // instructions will be deleted on exit 147 outWrites->clear(); 148 } 149 150 return true; 151} 152 153void 154NV50LegalizePostRA::replaceZero(Instruction *i) 155{ 156 for (int s = 0; i->srcExists(s); ++s) { 157 ImmediateValue *imm = i->getSrc(s)->asImm(); 158 if (imm && imm->reg.data.u64 == 0) 159 i->setSrc(s, r63); 160 } 161} 162 163void 164NV50LegalizePostRA::split64BitOp(Instruction *i) 165{ 166 if (i->dType == TYPE_F64) { 167 if (i->op == OP_MAD) 168 i->op 
= OP_FMA; 169 if (i->op == OP_ADD || i->op == OP_MUL || i->op == OP_FMA || 170 i->op == OP_CVT || i->op == OP_MIN || i->op == OP_MAX || 171 i->op == OP_SET) 172 return; 173 i->dType = i->sType = TYPE_U32; 174 175 i->bb->insertAfter(i, cloneForward(func, i)); 176 } 177} 178 179// Emulate PRERET: jump to the target and call to the origin from there 180// 181// WARNING: atm only works if BBs are affected by at most a single PRERET 182// 183// BB:0 184// preret BB:3 185// (...) 186// BB:3 187// (...) 188// ---> 189// BB:0 190// bra BB:3 + n0 (directly to the call; move to beginning of BB and fixate) 191// (...) 192// BB:3 193// bra BB:3 + n1 (skip the call) 194// call BB:0 + n2 (skip bra at beginning of BB:0) 195// (...) 196void 197NV50LegalizePostRA::handlePRERET(FlowInstruction *pre) 198{ 199 BasicBlock *bbE = pre->bb; 200 BasicBlock *bbT = pre->target.bb; 201 202 pre->subOp = NV50_IR_SUBOP_EMU_PRERET + 0; 203 bbE->remove(pre); 204 bbE->insertHead(pre); 205 206 Instruction *skip = new_FlowInstruction(func, OP_PRERET, bbT); 207 Instruction *call = new_FlowInstruction(func, OP_PRERET, bbE); 208 209 bbT->insertHead(call); 210 bbT->insertHead(skip); 211 212 // NOTE: maybe split blocks to prevent the instructions from moving ? 
213 214 skip->subOp = NV50_IR_SUBOP_EMU_PRERET + 1; 215 call->subOp = NV50_IR_SUBOP_EMU_PRERET + 2; 216} 217 218bool 219NV50LegalizePostRA::visit(BasicBlock *bb) 220{ 221 Instruction *i, *next; 222 223 // remove pseudo operations and non-fixed no-ops, split 64 bit operations 224 for (i = bb->getFirst(); i; i = next) { 225 next = i->next; 226 if (i->isNop()) { 227 bb->remove(i); 228 } else 229 if (i->op == OP_PRERET && prog->getTarget()->getChipset() < 0xa0) { 230 handlePRERET(i->asFlow()); 231 } else { 232 if (i->op != OP_MOV && i->op != OP_PFETCH && 233 (!i->defExists(0) || i->def(0).getFile() != FILE_ADDRESS)) 234 replaceZero(i); 235 if (typeSizeof(i->dType) == 8) 236 split64BitOp(i); 237 } 238 } 239 if (!bb->getEntry()) 240 return true; 241 242 return true; 243} 244 245class NV50LegalizeSSA : public Pass 246{ 247public: 248 NV50LegalizeSSA(Program *); 249 250 virtual bool visit(BasicBlock *bb); 251 252private: 253 void propagateWriteToOutput(Instruction *); 254 void handleDIV(Instruction *); 255 void handleMOD(Instruction *); 256 void handleMUL(Instruction *); 257 void handleAddrDef(Instruction *); 258 259 inline bool isARL(const Instruction *) const; 260 261 BuildUtil bld; 262 263 std::list<Instruction *> *outWrites; 264}; 265 266NV50LegalizeSSA::NV50LegalizeSSA(Program *prog) 267{ 268 bld.setProgram(prog); 269 270 if (prog->optLevel >= 2 && 271 (prog->getType() == Program::TYPE_GEOMETRY || 272 prog->getType() == Program::TYPE_VERTEX)) 273 outWrites = 274 reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv); 275 else 276 outWrites = NULL; 277} 278 279void 280NV50LegalizeSSA::propagateWriteToOutput(Instruction *st) 281{ 282 if (st->src(0).isIndirect(0) || st->getSrc(1)->refCount() != 1) 283 return; 284 285 // check def instruction can store 286 Instruction *di = st->getSrc(1)->defs.front()->getInsn(); 287 288 // TODO: move exports (if beneficial) in common opt pass 289 if (di->isPseudo() || isTextureOp(di->op) || di->defCount(0xff, true) > 1) 290 
return; 291 for (int s = 0; di->srcExists(s); ++s) 292 if (di->src(s).getFile() == FILE_IMMEDIATE) 293 return; 294 295 // We cannot set defs to non-lvalues before register allocation, so 296 // save & remove (to save registers) the exports and replace later. 297 outWrites->push_back(st); 298 st->bb->remove(st); 299} 300 301bool 302NV50LegalizeSSA::isARL(const Instruction *i) const 303{ 304 ImmediateValue imm; 305 306 if (i->op != OP_SHL || i->src(0).getFile() != FILE_GPR) 307 return false; 308 if (!i->src(1).getImmediate(imm)) 309 return false; 310 return imm.isInteger(0); 311} 312 313void 314NV50LegalizeSSA::handleAddrDef(Instruction *i) 315{ 316 Instruction *arl; 317 318 i->getDef(0)->reg.size = 2; // $aX are only 16 bit 319 320 // only ADDR <- SHL(GPR, IMM) and ADDR <- ADD(ADDR, IMM) are valid 321 if (i->srcExists(1) && i->src(1).getFile() == FILE_IMMEDIATE) { 322 if (i->op == OP_SHL && i->src(0).getFile() == FILE_GPR) 323 return; 324 if (i->op == OP_ADD && i->src(0).getFile() == FILE_ADDRESS) 325 return; 326 } 327 328 // turn $a sources into $r sources (can't operate on $a) 329 for (int s = 0; i->srcExists(s); ++s) { 330 Value *a = i->getSrc(s); 331 Value *r; 332 if (a->reg.file == FILE_ADDRESS) { 333 if (a->getInsn() && isARL(a->getInsn())) { 334 i->setSrc(s, a->getInsn()->getSrc(0)); 335 } else { 336 bld.setPosition(i, false); 337 r = bld.getSSA(); 338 bld.mkMov(r, a); 339 i->setSrc(s, r); 340 } 341 } 342 } 343 if (i->op == OP_SHL && i->src(1).getFile() == FILE_IMMEDIATE) 344 return; 345 346 // turn result back into $a 347 bld.setPosition(i, true); 348 arl = bld.mkOp2(OP_SHL, TYPE_U32, i->getDef(0), bld.getSSA(), bld.mkImm(0)); 349 i->setDef(0, arl->getSrc(0)); 350} 351 352void 353NV50LegalizeSSA::handleMUL(Instruction *mul) 354{ 355 if (isFloatType(mul->sType) || typeSizeof(mul->sType) <= 2) 356 return; 357 Value *def = mul->getDef(0); 358 Value *pred = mul->getPredicate(); 359 CondCode cc = mul->cc; 360 if (pred) 361 mul->setPredicate(CC_ALWAYS, NULL); 362 
363 if (mul->op == OP_MAD) { 364 Instruction *add = mul; 365 bld.setPosition(add, false); 366 Value *res = cloneShallow(func, mul->getDef(0)); 367 mul = bld.mkOp2(OP_MUL, add->sType, res, add->getSrc(0), add->getSrc(1)); 368 add->op = OP_ADD; 369 add->setSrc(0, mul->getDef(0)); 370 add->setSrc(1, add->getSrc(2)); 371 for (int s = 2; add->srcExists(s); ++s) 372 add->setSrc(s, NULL); 373 mul->subOp = add->subOp; 374 add->subOp = 0; 375 } 376 expandIntegerMUL(&bld, mul); 377 if (pred) 378 def->getInsn()->setPredicate(cc, pred); 379} 380 381// Use f32 division: first compute an approximate result, use it to reduce 382// the dividend, which should then be representable as f32, divide the reduced 383// dividend, and add the quotients. 384void 385NV50LegalizeSSA::handleDIV(Instruction *div) 386{ 387 const DataType ty = div->sType; 388 389 if (ty != TYPE_U32 && ty != TYPE_S32) 390 return; 391 392 Value *q, *q0, *qf, *aR, *aRf, *qRf, *qR, *t, *s, *m, *cond; 393 394 bld.setPosition(div, false); 395 396 Value *a, *af = bld.getSSA(); 397 Value *b, *bf = bld.getSSA(); 398 399 bld.mkCvt(OP_CVT, TYPE_F32, af, ty, div->getSrc(0)); 400 bld.mkCvt(OP_CVT, TYPE_F32, bf, ty, div->getSrc(1)); 401 402 if (isSignedType(ty)) { 403 af->getInsn()->src(0).mod = Modifier(NV50_IR_MOD_ABS); 404 bf->getInsn()->src(0).mod = Modifier(NV50_IR_MOD_ABS); 405 a = bld.getSSA(); 406 b = bld.getSSA(); 407 bld.mkOp1(OP_ABS, ty, a, div->getSrc(0)); 408 bld.mkOp1(OP_ABS, ty, b, div->getSrc(1)); 409 } else { 410 a = div->getSrc(0); 411 b = div->getSrc(1); 412 } 413 414 bf = bld.mkOp1v(OP_RCP, TYPE_F32, bld.getSSA(), bf); 415 bf = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), bf, bld.mkImm(-2)); 416 417 bld.mkOp2(OP_MUL, TYPE_F32, (qf = bld.getSSA()), af, bf)->rnd = ROUND_Z; 418 bld.mkCvt(OP_CVT, ty, (q0 = bld.getSSA()), TYPE_F32, qf)->rnd = ROUND_Z; 419 420 // get error of 1st result 421 expandIntegerMUL(&bld, 422 bld.mkOp2(OP_MUL, TYPE_U32, (t = bld.getSSA()), q0, b)); 423 bld.mkOp2(OP_SUB, TYPE_U32, (aRf = 
bld.getSSA()), a, t); 424 425 bld.mkCvt(OP_CVT, TYPE_F32, (aR = bld.getSSA()), TYPE_U32, aRf); 426 427 bld.mkOp2(OP_MUL, TYPE_F32, (qRf = bld.getSSA()), aR, bf)->rnd = ROUND_Z; 428 bld.mkCvt(OP_CVT, TYPE_U32, (qR = bld.getSSA()), TYPE_F32, qRf) 429 ->rnd = ROUND_Z; 430 bld.mkOp2(OP_ADD, ty, (q = bld.getSSA()), q0, qR); // add quotients 431 432 // correction: if modulus >= divisor, add 1 433 expandIntegerMUL(&bld, 434 bld.mkOp2(OP_MUL, TYPE_U32, (t = bld.getSSA()), q, b)); 435 bld.mkOp2(OP_SUB, TYPE_U32, (m = bld.getSSA()), a, t); 436 bld.mkCmp(OP_SET, CC_GE, TYPE_U32, (s = bld.getSSA()), m, b); 437 if (!isSignedType(ty)) { 438 div->op = OP_SUB; 439 div->setSrc(0, q); 440 div->setSrc(1, s); 441 } else { 442 t = q; 443 bld.mkOp2(OP_SUB, TYPE_U32, (q = bld.getSSA()), t, s); 444 s = bld.getSSA(); 445 t = bld.getSSA(); 446 // fix the sign 447 bld.mkOp2(OP_XOR, TYPE_U32, NULL, div->getSrc(0), div->getSrc(1)) 448 ->setFlagsDef(0, (cond = bld.getSSA(1, FILE_FLAGS))); 449 bld.mkOp1(OP_NEG, ty, s, q)->setPredicate(CC_S, cond); 450 bld.mkOp1(OP_MOV, ty, t, q)->setPredicate(CC_NS, cond); 451 452 div->op = OP_UNION; 453 div->setSrc(0, s); 454 div->setSrc(1, t); 455 } 456} 457 458void 459NV50LegalizeSSA::handleMOD(Instruction *mod) 460{ 461 if (mod->dType != TYPE_U32 && mod->dType != TYPE_S32) 462 return; 463 bld.setPosition(mod, false); 464 465 Value *q = bld.getSSA(); 466 Value *m = bld.getSSA(); 467 468 bld.mkOp2(OP_DIV, mod->dType, q, mod->getSrc(0), mod->getSrc(1)); 469 handleDIV(q->getInsn()); 470 471 bld.setPosition(mod, false); 472 expandIntegerMUL(&bld, bld.mkOp2(OP_MUL, TYPE_U32, m, q, mod->getSrc(1))); 473 474 mod->op = OP_SUB; 475 mod->setSrc(1, m); 476} 477 478bool 479NV50LegalizeSSA::visit(BasicBlock *bb) 480{ 481 Instruction *insn, *next; 482 // skipping PHIs (don't pass them to handleAddrDef) ! 
483 for (insn = bb->getEntry(); insn; insn = next) { 484 next = insn->next; 485 486 switch (insn->op) { 487 case OP_EXPORT: 488 if (outWrites) 489 propagateWriteToOutput(insn); 490 break; 491 case OP_DIV: 492 handleDIV(insn); 493 break; 494 case OP_MOD: 495 handleMOD(insn); 496 break; 497 case OP_MAD: 498 case OP_MUL: 499 handleMUL(insn); 500 break; 501 default: 502 break; 503 } 504 505 if (insn->defExists(0) && insn->getDef(0)->reg.file == FILE_ADDRESS) 506 handleAddrDef(insn); 507 } 508 return true; 509} 510 511class NV50LoweringPreSSA : public Pass 512{ 513public: 514 NV50LoweringPreSSA(Program *); 515 516private: 517 virtual bool visit(Instruction *); 518 virtual bool visit(Function *); 519 520 bool handleRDSV(Instruction *); 521 bool handleWRSV(Instruction *); 522 523 bool handleEXPORT(Instruction *); 524 525 bool handleDIV(Instruction *); 526 bool handleSQRT(Instruction *); 527 bool handlePOW(Instruction *); 528 529 bool handleSET(Instruction *); 530 bool handleSLCT(CmpInstruction *); 531 bool handleSELP(Instruction *); 532 533 bool handleTEX(TexInstruction *); 534 bool handleTXB(TexInstruction *); // I really 535 bool handleTXL(TexInstruction *); // hate 536 bool handleTXD(TexInstruction *); // these 3 537 538 bool handleCALL(Instruction *); 539 bool handlePRECONT(Instruction *); 540 bool handleCONT(Instruction *); 541 542 void checkPredicate(Instruction *); 543 544private: 545 const Target *const targ; 546 547 BuildUtil bld; 548 549 Value *tid; 550}; 551 552NV50LoweringPreSSA::NV50LoweringPreSSA(Program *prog) : 553 targ(prog->getTarget()), tid(NULL) 554{ 555 bld.setProgram(prog); 556} 557 558bool 559NV50LoweringPreSSA::visit(Function *f) 560{ 561 BasicBlock *root = BasicBlock::get(func->cfg.getRoot()); 562 563 if (prog->getType() == Program::TYPE_COMPUTE) { 564 // Add implicit "thread id" argument in $r0 to the function 565 Value *arg = new_LValue(func, FILE_GPR); 566 arg->reg.data.id = 0; 567 f->ins.push_back(arg); 568 569 bld.setPosition(root, false); 
570 tid = bld.mkMov(bld.getScratch(), arg, TYPE_U32)->getDef(0); 571 } 572 573 return true; 574} 575 576// move array source to first slot, convert to u16, add indirections 577bool 578NV50LoweringPreSSA::handleTEX(TexInstruction *i) 579{ 580 const int arg = i->tex.target.getArgCount(); 581 const int dref = arg; 582 const int lod = i->tex.target.isShadow() ? (arg + 1) : arg; 583 584 // dref comes before bias/lod 585 if (i->tex.target.isShadow()) 586 if (i->op == OP_TXB || i->op == OP_TXL) 587 i->swapSources(dref, lod); 588 589 // array index must be converted to u32 590 if (i->tex.target.isArray()) { 591 Value *layer = i->getSrc(arg - 1); 592 LValue *src = new_LValue(func, FILE_GPR); 593 bld.mkCvt(OP_CVT, TYPE_U32, src, TYPE_F32, layer); 594 bld.mkOp2(OP_MIN, TYPE_U32, src, src, bld.loadImm(NULL, 511)); 595 i->setSrc(arg - 1, src); 596 597 if (i->tex.target.isCube()) { 598 // Value *face = layer; 599 Value *x, *y; 600 x = new_LValue(func, FILE_GPR); 601 y = new_LValue(func, FILE_GPR); 602 layer = new_LValue(func, FILE_GPR); 603 604 i->tex.target = TEX_TARGET_2D_ARRAY; 605 606 // TODO: use TEXPREP to convert x,y,z,face -> x,y,layer 607 bld.mkMov(x, i->getSrc(0)); 608 bld.mkMov(y, i->getSrc(1)); 609 bld.mkMov(layer, i->getSrc(3)); 610 611 i->setSrc(0, x); 612 i->setSrc(1, y); 613 i->setSrc(2, layer); 614 i->setSrc(3, i->getSrc(4)); 615 i->setSrc(4, NULL); 616 } 617 } 618 619 // texel offsets are 3 immediate fields in the instruction, 620 // nv50 cannot do textureGatherOffsets 621 assert(i->tex.useOffsets <= 1); 622 623 return true; 624} 625 626// Bias must be equal for all threads of a quad or lod calculation will fail. 627// 628// The lanes of a quad are grouped by the bit in the condition register they 629// have set, which is selected by differing bias values. 630// Move the input values for TEX into a new register set for each group and 631// execute TEX only for a specific group. 
// We always need to use 4 new registers for the inputs/outputs because the
// implicitly calculated derivatives must be correct.
//
// TODO: move to SSA phase so we can easily determine whether bias is constant
bool
NV50LoweringPreSSA::handleTXB(TexInstruction *i)
{
   // condition code under which the TEX clone of each group executes
   const CondCode cc[4] = { CC_EQU, CC_S, CC_C, CC_O };
   int l, d;

   handleTEX(i);
   Value *bias = i->getSrc(i->tex.target.getArgCount());
   if (bias->isUniform())
      return true; // nothing to do, a single TEX is fine

   // cond is a UNION whose sources 1..3 are filled in by the loop below;
   // the instructions producing them are inserted in front of it.
   Instruction *cond = bld.mkOp1(OP_UNION, TYPE_U32, bld.getScratch(),
                                 bld.loadImm(NULL, 1));
   bld.setPosition(cond, false);

   for (l = 1; l < 4; ++l) {
      const uint8_t qop = QUADOP(SUBR, SUBR, SUBR, SUBR);
      Value *bit = bld.getSSA();
      Value *pred = bld.getScratch(1, FILE_FLAGS);
      Value *imm = bld.loadImm(NULL, (1 << l));
      // compare the bias against the one of lane l, result goes to flags
      bld.mkQuadop(qop, pred, l, bias, bias)->flagsDef = 0;
      bld.mkMov(bit, imm)->setPredicate(CC_EQ, pred);
      cond->setSrc(l, bit);
   }
   Value *flags = bld.getScratch(1, FILE_FLAGS);
   bld.setPosition(cond, true);
   bld.mkCvt(OP_CVT, TYPE_U8, flags, TYPE_U32, cond->getDef(0));

   // one predicated copy of the texture instruction per group
   Instruction *tex[4];
   for (l = 0; l < 4; ++l) {
      (tex[l] = cloneForward(func, i))->setPredicate(cc[l], flags);
      bld.insert(tex[l]);
   }

   // res[l][d]: result d of group l; group 0 uses the defs of tex[0]
   // directly, the other groups get predicated moves into fresh values
   Value *res[4][4];
   for (d = 0; i->defExists(d); ++d)
      res[0][d] = tex[0]->getDef(d);
   for (l = 1; l < 4; ++l) {
      for (d = 0; tex[l]->defExists(d); ++d) {
         res[l][d] = cloneShallow(func, res[0][d]);
         bld.mkMov(res[l][d], tex[l]->getDef(d))->setPredicate(cc[l], flags);
      }
   }

   // merge the per-group results into the original defs
   for (d = 0; i->defExists(d); ++d) {
      Instruction *dst = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(d));
      for (l = 0; l < 4; ++l)
         dst->setSrc(l, res[l][d]);
   }
   delete_Instruction(prog, i);
   return true;
}

// LOD must be equal for all threads of a quad.
// Unlike with TXB, here we can just diverge since there's no LOD calculation
// that would require all 4 threads' sources to be set up properly.
bool
NV50LoweringPreSSA::handleTXL(TexInstruction *i)
{
   handleTEX(i);
   Value *lod = i->getSrc(i->tex.target.getArgCount());
   if (lod->isUniform())
      return true;

   // isolate the texture instruction in a block of its own
   BasicBlock *currBB = i->bb;
   BasicBlock *texiBB = i->bb->splitBefore(i, false);
   BasicBlock *joinBB = i->bb->splitAfter(i);

   bld.setPosition(currBB, true);
   currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL);

   // one compare + branch to the texture block per lane, each in its own
   // block (no new block is created after the last lane)
   for (int l = 0; l <= 3; ++l) {
      const uint8_t qop = QUADOP(SUBR, SUBR, SUBR, SUBR);
      Value *pred = bld.getScratch(1, FILE_FLAGS);
      bld.setPosition(currBB, true);
      bld.mkQuadop(qop, pred, l, lod, lod)->flagsDef = 0;
      bld.mkFlow(OP_BRA, texiBB, CC_EQ, pred)->fixed = 1;
      currBB->cfg.attach(&texiBB->cfg, Graph::Edge::FORWARD);
      if (l <= 2) {
         BasicBlock *laneBB = new BasicBlock(func);
         currBB->cfg.attach(&laneBB->cfg, Graph::Edge::TREE);
         currBB = laneBB;
      }
   }
   bld.setPosition(joinBB, false);
   bld.mkOp(OP_JOIN, TYPE_NONE, NULL);
   return true;
}

// Lowers TXD (texture fetch with explicit derivatives) to four plain TEX
// operations, one per quad lane, with the coordinates of the lanes set up
// via quadops from dPdx/dPdy. The original instruction is removed from its
// block.
bool
NV50LoweringPreSSA::handleTXD(TexInstruction *i)
{
   // quadop selectors per lane: [l][0] is used with dPdx, [l][1] with dPdy
   static const uint8_t qOps[4][2] =
   {
      { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(MOV2, MOV2, ADD,  ADD) }, // l0
      { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(MOV2, MOV2, ADD,  ADD) }, // l1
      { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l2
      { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l3
   };
   Value *def[4][4]; // def[c][l]: result component c as computed for lane l
   Value *crd[3];
   Instruction *tex;
   Value *zero = bld.loadImm(bld.getSSA(), 0);
   int l, c;
   const int dim = i->tex.target.getDim();

   handleTEX(i);
   i->op = OP_TEX; // no need to clone dPdx/dPdy later

   for (c = 0; c < dim; ++c)
      crd[c] = bld.getScratch();

   bld.mkOp(OP_QUADON, TYPE_NONE, NULL);
   for (l = 0; l < 4; ++l) {
      // mov coordinates from lane l to all lanes
      for (c = 0; c < dim; ++c)
         bld.mkQuadop(0x00, crd[c], l, i->getSrc(c), zero);
      // add dPdx from lane l to lanes dx
      for (c = 0; c < dim; ++c)
         bld.mkQuadop(qOps[l][0], crd[c], l, i->dPdx[c].get(), crd[c]);
      // add dPdy from lane l to lanes dy
      for (c = 0; c < dim; ++c)
         bld.mkQuadop(qOps[l][1], crd[c], l, i->dPdy[c].get(), crd[c]);
      // texture
      bld.insert(tex = cloneForward(func, i));
      for (c = 0; c < dim; ++c)
         tex->setSrc(c, crd[c]);
      // save results
      for (c = 0; i->defExists(c); ++c) {
         Instruction *mov;
         def[c][l] = bld.getSSA();
         mov = bld.mkMov(def[c][l], tex->getDef(c));
         mov->fixed = 1;
         mov->lanes = 1 << l; // restrict the move to lane l
      }
   }
   bld.mkOp(OP_QUADPOP, TYPE_NONE, NULL);

   // merge the per-lane results into the original defs
   for (c = 0; i->defExists(c); ++c) {
      Instruction *u = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(c));
      for (l = 0; l < 4; ++l)
         u->setSrc(l, def[c][l]);
   }

   i->bb->remove(i);
   return true;
}

// A SET with F32 result is turned into a SET with U32 result, followed by
// ABS (as S32) and a conversion S32 -> F32 of that result.
bool
NV50LoweringPreSSA::handleSET(Instruction *i)
{
   if (i->dType == TYPE_F32) {
      bld.setPosition(i, true);
      i->dType = TYPE_U32;
      bld.mkOp1(OP_ABS, TYPE_S32, i->getDef(0), i->getDef(0));
      bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(0), TYPE_S32, i->getDef(0));
   }
   return true;
}

// Lowers SLCT to a SET of source 2 against 0 that writes a flags value,
// followed by two moves predicated on it and a UNION of both.
bool
NV50LoweringPreSSA::handleSLCT(CmpInstruction *i)
{
   Value *src0 = bld.getSSA();
   Value *src1 = bld.getSSA();
   Value *pred = bld.getScratch(1, FILE_FLAGS);

   Value *v0 = i->getSrc(0);
   Value *v1 = i->getSrc(1);
   // XXX: these probably shouldn't be immediates in the first place ...
   if (v0->asImm())
      v0 = bld.mkMov(bld.getSSA(), v0)->getDef(0);
   if (v1->asImm())
      v1 = bld.mkMov(bld.getSSA(), v1)->getDef(0);

   // the predicated moves and the UNION go after i ...
   bld.setPosition(i, true);
   bld.mkMov(src0, v0)->setPredicate(CC_NE, pred);
   bld.mkMov(src1, v1)->setPredicate(CC_EQ, pred);
   bld.mkOp2(OP_UNION, i->dType, i->getDef(0), src0, src1);

   // ... while i itself becomes the SET producing the predicate
   bld.setPosition(i, false);
   i->op = OP_SET;
   i->setFlagsDef(0, pred);
   i->dType = TYPE_U8;
   i->setSrc(0, i->getSrc(2));
   i->setSrc(2, NULL);
   i->setSrc(1, bld.loadImm(NULL, 0));

   return true;
}

// Lowers SELP to two moves predicated on source 2 and a UNION of both;
// the original instruction is deleted.
bool
NV50LoweringPreSSA::handleSELP(Instruction *i)
{
   Value *src0 = bld.getSSA();
   Value *src1 = bld.getSSA();

   Value *v0 = i->getSrc(0);
   Value *v1 = i->getSrc(1);
   if (v0->asImm())
      v0 = bld.mkMov(bld.getSSA(), v0)->getDef(0);
   if (v1->asImm())
      v1 = bld.mkMov(bld.getSSA(), v1)->getDef(0);

   bld.mkMov(src0, v0)->setPredicate(CC_NE, i->getSrc(2));
   bld.mkMov(src1, v1)->setPredicate(CC_EQ, i->getSrc(2));
   bld.mkOp2(OP_UNION, i->dType, i->getDef(0), src0, src1);
   delete_Instruction(prog, i);
   return true;
}

// Turns a system value write into an EXPORT to the shader output address
// the target reports for it. Returns false if that address is >= 0x400.
bool
NV50LoweringPreSSA::handleWRSV(Instruction *i)
{
   Symbol *sym = i->getSrc(0)->asSym();

   // these are all shader outputs, $sreg are not writeable
   uint32_t addr = targ->getSVAddress(FILE_SHADER_OUTPUT, sym);
   if (addr >= 0x400)
      return false;
   sym = bld.mkSymbol(FILE_SHADER_OUTPUT, 0, i->sType, addr);

   bld.mkStore(OP_EXPORT, i->dType, sym, i->getIndirect(0, 0), i->getSrc(1));

   bld.getBB()->remove(i);
   return true;
}

bool
NV50LoweringPreSSA::handleCALL(Instruction *i)
{
   if (prog->getType() == Program::TYPE_COMPUTE) {
      // Add implicit "thread id" argument in $r0 to the function
      i->setSrc(i->srcCount(), tid);
   }
   return true;
}

// PRECONT is simply deleted.
bool
NV50LoweringPreSSA::handlePRECONT(Instruction *i)
{
   delete_Instruction(prog, i);
   return true;
}

// CONT is turned into a plain branch.
bool
NV50LoweringPreSSA::handleCONT(Instruction *i)
{
   i->op = OP_BRA;
   return true;
}

// Lowers a system value read. Values whose input address is >= 0x400 are
// left alone; all others are replaced by an instruction sequence that
// depends on the system value, and the RDSV is removed from its block.
bool
NV50LoweringPreSSA::handleRDSV(Instruction *i)
{
   Symbol *sym = i->getSrc(0)->asSym();
   uint32_t addr = targ->getSVAddress(FILE_SHADER_INPUT, sym);
   Value *def = i->getDef(0);
   SVSemantic sv = sym->reg.data.sv.sv;
   int idx = sym->reg.data.sv.index;

   if (addr >= 0x400) // mov $sreg
      return true;

   switch (sv) {
   case SV_POSITION:
      assert(prog->getType() == Program::TYPE_FRAGMENT);
      bld.mkInterp(NV50_IR_INTERP_LINEAR, i->getDef(0), addr, NULL);
      break;
   case SV_FACE:
      bld.mkInterp(NV50_IR_INTERP_FLAT, def, addr, NULL);
      if (i->dType == TYPE_F32) {
         // keep only bit 31 and XOR with 0xbf800000 (the bit pattern of
         // -1.0f), which yields the bit pattern of either -1.0f or +1.0f
         bld.mkOp2(OP_AND, TYPE_U32, def, def, bld.mkImm(0x80000000));
         bld.mkOp2(OP_XOR, TYPE_U32, def, def, bld.mkImm(0xbf800000));
      }
      break;
   case SV_NCTAID:
   case SV_CTAID:
   case SV_NTID:
      // out of range components are constants, the rest is loaded as a
      // 16 bit value from FILE_MEMORY_SHARED and widened to 32 bit
      if ((sv == SV_NCTAID && idx >= 2) ||
          (sv == SV_NTID && idx >= 3)) {
         bld.mkMov(def, bld.mkImm(1));
      } else if (sv == SV_CTAID && idx >= 2) {
         bld.mkMov(def, bld.mkImm(0));
      } else {
         Value *x = bld.getSSA(2);
         bld.mkOp1(OP_LOAD, TYPE_U16, x,
                   bld.mkSymbol(FILE_MEMORY_SHARED, 0, TYPE_U16, addr));
         bld.mkCvt(OP_CVT, TYPE_U32, def, TYPE_U16, x);
      }
      break;
   case SV_TID:
      // the components are extracted from tid:
      // bits 0-15 -> idx 0, bits 16-25 -> idx 1, bits 26 and up -> idx 2
      if (idx == 0) {
         bld.mkOp2(OP_AND, TYPE_U32, def, tid, bld.mkImm(0x0000ffff));
      } else if (idx == 1) {
         bld.mkOp2(OP_AND, TYPE_U32, def, tid, bld.mkImm(0x03ff0000));
         bld.mkOp2(OP_SHR, TYPE_U32, def, def, bld.mkImm(16));
      } else if (idx == 2) {
         bld.mkOp2(OP_SHR, TYPE_U32, def, tid, bld.mkImm(26));
      } else {
         bld.mkMov(def, bld.mkImm(0));
      }
      break;
   default:
      bld.mkFetch(i->getDef(0), i->dType,
                  FILE_SHADER_INPUT, addr, i->getIndirect(0, 0), NULL);
      break;
   }
   bld.getBB()->remove(i);
   return true;
}

// Float division becomes a multiplication by the reciprocal of source 1;
// non-float divisions are not touched here.
bool
NV50LoweringPreSSA::handleDIV(Instruction *i)
{
   if (!isFloatType(i->dType))
      return true;
   bld.setPosition(i, false);
   Instruction *rcp = bld.mkOp1(OP_RCP, i->dType, bld.getSSA(), i->getSrc(1));
   i->op = OP_MUL;
   i->setSrc(1, rcp->getDef(0));
   return true;
}

// SQRT(x) becomes MUL(x, RSQ(x)).
bool
NV50LoweringPreSSA::handleSQRT(Instruction *i)
{
   Instruction *rsq = bld.mkOp1(OP_RSQ, TYPE_F32,
                                bld.getSSA(), i->getSrc(0));
   i->op = OP_MUL;
   i->setSrc(1, rsq->getDef(0));

   return true;
}

// POW(x, y) becomes EX2(PREEX2(y * LG2(x))).
bool
NV50LoweringPreSSA::handlePOW(Instruction *i)
{
   LValue *val = bld.getScratch();

   bld.mkOp1(OP_LG2, TYPE_F32, val, i->getSrc(0));
   bld.mkOp2(OP_MUL, TYPE_F32, val, i->getSrc(1), val)->dnz = 1;
   bld.mkOp1(OP_PREEX2, TYPE_F32, val, val);

   i->op = OP_EX2;
   i->setSrc(0, val);
   i->setSrc(1, NULL);

   return true;
}

// In fragment programs, an EXPORT without indirect addressing becomes a
// final MOV into the GPR whose id is the export offset / 4. Indirect
// exports make the function return false; other program types are left
// untouched.
bool
NV50LoweringPreSSA::handleEXPORT(Instruction *i)
{
   if (prog->getType() == Program::TYPE_FRAGMENT) {
      if (i->getIndirect(0, 0)) {
         // TODO: redirect to l[] here, load to GPRs at exit
         return false;
      } else {
         int id = i->getSrc(0)->reg.data.offset / 4; // in 32 bit reg units

         i->op = OP_MOV;
         i->subOp = NV50_IR_SUBOP_MOV_FINAL;
         i->src(0).set(i->src(1));
         i->setSrc(1, NULL);
         i->setDef(0, new_LValue(func, FILE_GPR));
         i->getDef(0)->reg.data.id = id;

         prog->maxGPR = MAX2(prog->maxGPR, id);
      }
   }
   return true;
}

// Set flags according to predicate and make the instruction read $cX.
1011void 1012NV50LoweringPreSSA::checkPredicate(Instruction *insn) 1013{ 1014 Value *pred = insn->getPredicate(); 1015 Value *cdst; 1016 1017 if (!pred || pred->reg.file == FILE_FLAGS) 1018 return; 1019 cdst = bld.getSSA(1, FILE_FLAGS); 1020 1021 bld.mkCmp(OP_SET, CC_NEU, TYPE_U32, cdst, bld.loadImm(NULL, 0), pred); 1022 1023 insn->setPredicate(insn->cc, cdst); 1024} 1025 1026// 1027// - add quadop dance for texturing 1028// - put FP outputs in GPRs 1029// - convert instruction sequences 1030// 1031bool 1032NV50LoweringPreSSA::visit(Instruction *i) 1033{ 1034 bld.setPosition(i, false); 1035 1036 if (i->cc != CC_ALWAYS) 1037 checkPredicate(i); 1038 1039 switch (i->op) { 1040 case OP_TEX: 1041 case OP_TXF: 1042 case OP_TXG: 1043 return handleTEX(i->asTex()); 1044 case OP_TXB: 1045 return handleTXB(i->asTex()); 1046 case OP_TXL: 1047 return handleTXL(i->asTex()); 1048 case OP_TXD: 1049 return handleTXD(i->asTex()); 1050 case OP_EX2: 1051 bld.mkOp1(OP_PREEX2, TYPE_F32, i->getDef(0), i->getSrc(0)); 1052 i->setSrc(0, i->getDef(0)); 1053 break; 1054 case OP_SET: 1055 return handleSET(i); 1056 case OP_SLCT: 1057 return handleSLCT(i->asCmp()); 1058 case OP_SELP: 1059 return handleSELP(i); 1060 case OP_POW: 1061 return handlePOW(i); 1062 case OP_DIV: 1063 return handleDIV(i); 1064 case OP_SQRT: 1065 return handleSQRT(i); 1066 case OP_EXPORT: 1067 return handleEXPORT(i); 1068 case OP_RDSV: 1069 return handleRDSV(i); 1070 case OP_WRSV: 1071 return handleWRSV(i); 1072 case OP_CALL: 1073 return handleCALL(i); 1074 case OP_PRECONT: 1075 return handlePRECONT(i); 1076 case OP_CONT: 1077 return handleCONT(i); 1078 default: 1079 break; 1080 } 1081 return true; 1082} 1083 1084bool 1085TargetNV50::runLegalizePass(Program *prog, CGStage stage) const 1086{ 1087 bool ret = false; 1088 1089 if (stage == CG_STAGE_PRE_SSA) { 1090 NV50LoweringPreSSA pass(prog); 1091 ret = pass.run(prog, false, true); 1092 } else 1093 if (stage == CG_STAGE_SSA) { 1094 if (!prog->targetPriv) 1095 
prog->targetPriv = new std::list<Instruction *>(); 1096 NV50LegalizeSSA pass(prog); 1097 ret = pass.run(prog, false, true); 1098 } else 1099 if (stage == CG_STAGE_POST_RA) { 1100 NV50LegalizePostRA pass; 1101 ret = pass.run(prog, false, true); 1102 if (prog->targetPriv) 1103 delete reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv); 1104 } 1105 return ret; 1106} 1107 1108} // namespace nv50_ir 1109