R600ISelLowering.cpp revision 36b56886974eae4f9c5ebc96befd3e7bfe5de338
//===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Custom DAG lowering for R600
//
//===----------------------------------------------------------------------===//

#include "R600ISelLowering.h"
#include "R600Defines.h"
#include "R600InstrInfo.h"
#include "R600MachineFunctionInfo.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Function.h"

using namespace llvm;

R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
    AMDGPUTargetLowering(TM),
    Gen(TM.getSubtarget<AMDGPUSubtarget>().getGeneration()) {
  addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
  addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
  addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
  addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
  addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass);
  addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass);

  computeRegisterProperties();

  // Set condition code actions
  setCondCodeAction(ISD::SETO, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUO, MVT::f32, Expand);
  setCondCodeAction(ISD::SETLT, MVT::f32, Expand);
  setCondCodeAction(ISD::SETLE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOLT, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUGE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
  setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
  setCondCodeAction(ISD::SETULE, MVT::f32, Expand);

  setCondCodeAction(ISD::SETLE, MVT::i32, Expand);
  setCondCodeAction(ISD::SETLT, MVT::i32, Expand);
  setCondCodeAction(ISD::SETULE, MVT::i32, Expand);
  setCondCodeAction(ISD::SETULT, MVT::i32, Expand);

  setOperationAction(ISD::FCOS, MVT::f32, Custom);
  setOperationAction(ISD::FSIN, MVT::f32, Custom);

  setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
  setOperationAction(ISD::SETCC, MVT::v2i32, Expand);

  setOperationAction(ISD::BR_CC, MVT::i32, Expand);
  setOperationAction(ISD::BR_CC, MVT::f32, Expand);

  setOperationAction(ISD::FSUB, MVT::f32, Expand);

  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);

  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);

  setOperationAction(ISD::SETCC, MVT::i32, Expand);
  setOperationAction(ISD::SETCC, MVT::f32, Expand);
  setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);

  setOperationAction(ISD::SELECT, MVT::i32, Expand);
  setOperationAction(ISD::SELECT, MVT::f32, Expand);
  setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
  setOperationAction(ISD::SELECT, MVT::v2f32, Expand);
  setOperationAction(ISD::SELECT, MVT::v4i32, Expand);
  setOperationAction(ISD::SELECT, MVT::v4f32, Expand);
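  // With the action tables above, Expand asks the generic legalizer to
  // rewrite the node in terms of other operations, while Custom routes the
  // node to this class's LowerOperation() / ReplaceNodeResults() below.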

  // Legalize loads and stores to the private address space.
  setOperationAction(ISD::LOAD, MVT::i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);

  // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address
  // spaces, so it is custom lowered to handle those where it isn't.
  setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Custom);
  setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Custom);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Custom);
  setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom);
  setLoadExtAction(ISD::EXTLOAD, MVT::i16, Custom);

  setOperationAction(ISD::STORE, MVT::i8, Custom);
  setOperationAction(ISD::STORE, MVT::i32, Custom);
  setOperationAction(ISD::STORE, MVT::v2i32, Custom);
  setOperationAction(ISD::STORE, MVT::v4i32, Custom);
  setTruncStoreAction(MVT::i32, MVT::i8, Custom);
  setTruncStoreAction(MVT::i32, MVT::i16, Custom);

  setOperationAction(ISD::LOAD, MVT::i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
  setOperationAction(ISD::FrameIndex, MVT::i32, Custom);

  setTargetDAGCombine(ISD::FP_ROUND);
  setTargetDAGCombine(ISD::FP_TO_SINT);
  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
  setTargetDAGCombine(ISD::SELECT_CC);
  setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);

  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);

  setBooleanContents(ZeroOrNegativeOneBooleanContent);
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
  setSchedulingPreference(Sched::Source);
}

MachineBasicBlock *R600TargetLowering::EmitInstrWithCustomInserter(
    MachineInstr *MI, MachineBasicBlock *BB) const {
  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  MachineBasicBlock::iterator I = *MI;
  const R600InstrInfo *TII =
      static_cast<const R600InstrInfo*>(MF->getTarget().getInstrInfo());

  switch (MI->getOpcode()) {
  default:
    // Replace LDS_*_RET instructions that don't have any uses with the
    // equivalent LDS_*_NORET instruction.
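    // (The RET forms return the previous value of the LDS location, through
    // the LDS output queue on this hardware; when nothing reads that result,
    // switching to the NORET form avoids the read-back entirely.)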
    if (TII->isLDSRetInstr(MI->getOpcode())) {
      int DstIdx = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
      assert(DstIdx != -1);
      MachineInstrBuilder NewMI;
      if (!MRI.use_empty(MI->getOperand(DstIdx).getReg()))
        return BB;

      NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
                      TII->get(AMDGPU::getLDSNoRetOp(MI->getOpcode())));
      for (unsigned i = 1, e = MI->getNumOperands(); i < e; ++i) {
        NewMI.addOperand(MI->getOperand(i));
      }
    } else {
      return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
    }
    break;
  case AMDGPU::CLAMP_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
                                                       AMDGPU::MOV,
                                                       MI->getOperand(0).getReg(),
                                                       MI->getOperand(1).getReg());
    TII->addFlag(NewMI, 0, MO_FLAG_CLAMP);
    break;
  }

  case AMDGPU::FABS_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
                                                       AMDGPU::MOV,
                                                       MI->getOperand(0).getReg(),
                                                       MI->getOperand(1).getReg());
    TII->addFlag(NewMI, 0, MO_FLAG_ABS);
    break;
  }

  case AMDGPU::FNEG_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
                                                       AMDGPU::MOV,
                                                       MI->getOperand(0).getReg(),
                                                       MI->getOperand(1).getReg());
    TII->addFlag(NewMI, 0, MO_FLAG_NEG);
    break;
  }

  case AMDGPU::MASK_WRITE: {
    unsigned maskedRegister = MI->getOperand(0).getReg();
    assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
    MachineInstr *defInstr = MRI.getVRegDef(maskedRegister);
    TII->addFlag(defInstr, 0, MO_FLAG_MASK);
    break;
  }

  case AMDGPU::MOV_IMM_F32:
    TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
                     MI->getOperand(1).getFPImm()->getValueAPF()
                         .bitcastToAPInt().getZExtValue());
    break;
  case AMDGPU::MOV_IMM_I32:
    TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
                     MI->getOperand(1).getImm());
    break;
  case AMDGPU::CONST_COPY: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV,
        MI->getOperand(0).getReg(), AMDGPU::ALU_CONST);
    TII->setImmOperand(NewMI, AMDGPU::OpName::src0_sel,
                       MI->getOperand(1).getImm());
    break;
  }

  case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
  case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
  case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
    unsigned EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
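    // When the write is immediately followed by RETURN, the EOP (end of
    // program) bit is set on it below, so the final memory export also
    // terminates the program.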

    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
        .addOperand(MI->getOperand(0))
        .addOperand(MI->getOperand(1))
        .addImm(EOP); // Set End of program bit
    break;
  }

  case AMDGPU::TXD: {
    unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    MachineOperand &RID = MI->getOperand(4);
    MachineOperand &SID = MI->getOperand(5);
    unsigned TextureId = MI->getOperand(6).getImm();
    unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
    unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;

    switch (TextureId) {
    case 5: // Rect
      CTX = CTY = 0;
      break;
    case 6: // Shadow1D
      SrcW = SrcZ;
      break;
    case 7: // Shadow2D
      SrcW = SrcZ;
      break;
    case 8: // ShadowRect
      CTX = CTY = 0;
      SrcW = SrcZ;
      break;
    case 9: // 1DArray
      SrcZ = SrcY;
      CTZ = 0;
      break;
    case 10: // 2DArray
      CTZ = 0;
      break;
    case 11: // Shadow1DArray
      SrcZ = SrcY;
      CTZ = 0;
      break;
    case 12: // Shadow2DArray
      CTZ = 0;
      break;
    }
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
        .addOperand(MI->getOperand(3))
        .addImm(SrcX)
        .addImm(SrcY)
        .addImm(SrcZ)
        .addImm(SrcW)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(1)
        .addImm(2)
        .addImm(3)
        .addOperand(RID)
        .addOperand(SID)
        .addImm(CTX)
        .addImm(CTY)
        .addImm(CTZ)
        .addImm(CTW);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
        .addOperand(MI->getOperand(2))
        .addImm(SrcX)
        .addImm(SrcY)
        .addImm(SrcZ)
        .addImm(SrcW)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(1)
        .addImm(2)
        .addImm(3)
        .addOperand(RID)
        .addOperand(SID)
        .addImm(CTX)
        .addImm(CTY)
        .addImm(CTZ)
        .addImm(CTW);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
        .addOperand(MI->getOperand(0))
        .addOperand(MI->getOperand(1))
        .addImm(SrcX)
        .addImm(SrcY)
        .addImm(SrcZ)
        .addImm(SrcW)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(1)
        .addImm(2)
        .addImm(3)
        .addOperand(RID)
        .addOperand(SID)
        .addImm(CTX)
        .addImm(CTY)
        .addImm(CTZ)
        .addImm(CTW)
        .addReg(T0, RegState::Implicit)
        .addReg(T1, RegState::Implicit);
    break;
  }

  case AMDGPU::TXD_SHADOW: {
    unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    MachineOperand &RID = MI->getOperand(4);
    MachineOperand &SID = MI->getOperand(5);
    unsigned TextureId = MI->getOperand(6).getImm();
    unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
    unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;

    switch (TextureId) {
    case 5: // Rect
      CTX = CTY = 0;
      break;
    case 6: // Shadow1D
      SrcW = SrcZ;
      break;
    case 7: // Shadow2D
      SrcW = SrcZ;
      break;
    case 8: // ShadowRect
      CTX = CTY = 0;
      SrcW = SrcZ;
      break;
    case 9: // 1DArray
      SrcZ = SrcY;
      CTZ = 0;
      break;
    case 10: // 2DArray
      CTZ = 0;
      break;
    case 11: // Shadow1DArray
      SrcZ = SrcY;
      CTZ = 0;
      break;
    case 12: // Shadow2DArray
      CTZ = 0;
      break;
    }

    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
        .addOperand(MI->getOperand(3))
        .addImm(SrcX)
        .addImm(SrcY)
        .addImm(SrcZ)
        .addImm(SrcW)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(1)
        .addImm(2)
        .addImm(3)
        .addOperand(RID)
        .addOperand(SID)
        .addImm(CTX)
        .addImm(CTY)
        .addImm(CTZ)
        .addImm(CTW);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
        .addOperand(MI->getOperand(2))
        .addImm(SrcX)
        .addImm(SrcY)
        .addImm(SrcZ)
        .addImm(SrcW)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(1)
        .addImm(2)
        .addImm(3)
        .addOperand(RID)
        .addOperand(SID)
        .addImm(CTX)
        .addImm(CTY)
        .addImm(CTZ)
        .addImm(CTW);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
        .addOperand(MI->getOperand(0))
        .addOperand(MI->getOperand(1))
        .addImm(SrcX)
        .addImm(SrcY)
        .addImm(SrcZ)
        .addImm(SrcW)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(1)
        .addImm(2)
        .addImm(3)
        .addOperand(RID)
        .addOperand(SID)
        .addImm(CTX)
        .addImm(CTY)
        .addImm(CTZ)
        .addImm(CTW)
        .addReg(T0, RegState::Implicit)
        .addReg(T1, RegState::Implicit);
    break;
  }

  case AMDGPU::BRANCH:
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
        .addOperand(MI->getOperand(0));
    break;

  case AMDGPU::BRANCH_COND_f32: {
    MachineInstr *NewMI =
        BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
                AMDGPU::PREDICATE_BIT)
            .addOperand(MI->getOperand(1))
            .addImm(OPCODE_IS_NOT_ZERO)
            .addImm(0); // Flags
    TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
        .addOperand(MI->getOperand(0))
        .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
    break;
  }

  case AMDGPU::BRANCH_COND_i32: {
    MachineInstr *NewMI =
        BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
                AMDGPU::PREDICATE_BIT)
            .addOperand(MI->getOperand(1))
            .addImm(OPCODE_IS_NOT_ZERO_INT)
            .addImm(0); // Flags
    TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
        .addOperand(MI->getOperand(0))
        .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
    break;
  }

  case AMDGPU::EG_ExportSwz:
  case AMDGPU::R600_ExportSwz: {
    // Instruction is left unmodified if it's not the last one of its type
    bool isLastInstructionOfItsType = true;
    unsigned InstExportType = MI->getOperand(1).getImm();
    for (MachineBasicBlock::iterator NextExportInst = std::next(I),
         EndBlock = BB->end(); NextExportInst != EndBlock;
         NextExportInst = std::next(NextExportInst)) {
      if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
          NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
        unsigned CurrentInstExportType = NextExportInst->getOperand(1)
            .getImm();
        if (CurrentInstExportType == InstExportType) {
          isLastInstructionOfItsType = false;
          break;
        }
      }
    }
    bool EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
    if (!EOP && !isLastInstructionOfItsType)
      return BB;
    unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz) ? 84 : 40;
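    // The magic numbers 84 and 40 are, presumably, the raw CF instruction
    // encodings this operand position expects for the Evergreen and R600
    // forms of the export (see the *_ExportSwz definitions in
    // R600Instructions.td).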
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
        .addOperand(MI->getOperand(0))
        .addOperand(MI->getOperand(1))
        .addOperand(MI->getOperand(2))
        .addOperand(MI->getOperand(3))
        .addOperand(MI->getOperand(4))
        .addOperand(MI->getOperand(5))
        .addOperand(MI->getOperand(6))
        .addImm(CfInst)
        .addImm(EOP);
    break;
  }
  case AMDGPU::RETURN: {
    // RETURN instructions must have the live-out registers as implicit uses,
    // otherwise they appear dead.
    R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
    MachineInstrBuilder MIB(*MF, MI);
    for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i)
      MIB.addReg(MFI->LiveOuts[i], RegState::Implicit);
    return BB;
  }
  }

  MI->eraseFromParent();
  return BB;
}

//===----------------------------------------------------------------------===//
// Custom DAG Lowering Operations
//===----------------------------------------------------------------------===//

SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
  switch (Op.getOpcode()) {
  default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
  case ISD::FCOS:
  case ISD::FSIN: return LowerTrig(Op, DAG);
  case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
  case ISD::STORE: return LowerSTORE(Op, DAG);
  case ISD::LOAD: return LowerLOAD(Op, DAG);
  case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
  case ISD::INTRINSIC_VOID: {
    SDValue Chain = Op.getOperand(0);
    unsigned IntrinsicID =
        cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    switch (IntrinsicID) {
    case AMDGPUIntrinsic::AMDGPU_store_output: {
      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
      unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
      MFI->LiveOuts.push_back(Reg);
      return DAG.getCopyToReg(Chain, SDLoc(Op), Reg, Op.getOperand(2));
    }
    case AMDGPUIntrinsic::R600_store_swizzle: {
      const SDValue Args[8] = {
        Chain,
        Op.getOperand(2), // Export Value
        Op.getOperand(3), // ArrayBase
        Op.getOperand(4), // Type
        DAG.getConstant(0, MVT::i32), // SWZ_X
        DAG.getConstant(1, MVT::i32), // SWZ_Y
        DAG.getConstant(2, MVT::i32), // SWZ_Z
        DAG.getConstant(3, MVT::i32)  // SWZ_W
      };
      return DAG.getNode(AMDGPUISD::EXPORT, SDLoc(Op), Op.getValueType(),
                         Args, 8);
    }

    // default for switch(IntrinsicID)
    default: break;
    }
    // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
    break;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IntrinsicID =
        cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
    EVT VT = Op.getValueType();
    SDLoc DL(Op);
    switch (IntrinsicID) {
    default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
    case AMDGPUIntrinsic::R600_load_input: {
      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
      unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
      MachineFunction &MF = DAG.getMachineFunction();
      MachineRegisterInfo &MRI = MF.getRegInfo();
      MRI.addLiveIn(Reg);
      return DAG.getCopyFromReg(DAG.getEntryNode(),
                                SDLoc(DAG.getEntryNode()), Reg, VT);
    }

    case AMDGPUIntrinsic::R600_interp_input: {
      int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
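      // Attributes are packed four to an INTERP parameter slot: slot / 4
      // selects the parameter, slot % 4 the channel. A negative ij register
      // index (ijb) apparently denotes constant interpolation, handled via
      // INTERP_VEC_LOAD below; otherwise the i/j barycentrics live in the
      // register pair T(2 * ijb) / T(2 * ijb + 1).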
      int ijb = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue();
      MachineSDNode *interp;
      if (ijb < 0) {
        const MachineFunction &MF = DAG.getMachineFunction();
        const R600InstrInfo *TII =
            static_cast<const R600InstrInfo*>(MF.getTarget().getInstrInfo());
        interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL,
            MVT::v4f32, DAG.getTargetConstant(slot / 4, MVT::i32));
        return DAG.getTargetExtractSubreg(
            TII->getRegisterInfo().getSubRegFromChannel(slot % 4),
            DL, MVT::f32, SDValue(interp, 0));
      }
      MachineFunction &MF = DAG.getMachineFunction();
      MachineRegisterInfo &MRI = MF.getRegInfo();
      unsigned RegisterI = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb);
      unsigned RegisterJ = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1);
      MRI.addLiveIn(RegisterI);
      MRI.addLiveIn(RegisterJ);
      SDValue RegisterINode = DAG.getCopyFromReg(DAG.getEntryNode(),
          SDLoc(DAG.getEntryNode()), RegisterI, MVT::f32);
      SDValue RegisterJNode = DAG.getCopyFromReg(DAG.getEntryNode(),
          SDLoc(DAG.getEntryNode()), RegisterJ, MVT::f32);

      if (slot % 4 < 2)
        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
            MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4, MVT::i32),
            RegisterJNode, RegisterINode);
      else
        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
            MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4, MVT::i32),
            RegisterJNode, RegisterINode);
      return SDValue(interp, slot % 2);
    }
    case AMDGPUIntrinsic::R600_interp_xy:
    case AMDGPUIntrinsic::R600_interp_zw: {
      int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
      MachineSDNode *interp;
      SDValue RegisterINode = Op.getOperand(2);
      SDValue RegisterJNode = Op.getOperand(3);

      if (IntrinsicID == AMDGPUIntrinsic::R600_interp_xy)
        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
            MVT::f32, MVT::f32, DAG.getTargetConstant(slot, MVT::i32),
            RegisterJNode, RegisterINode);
      else
        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
            MVT::f32, MVT::f32, DAG.getTargetConstant(slot, MVT::i32),
            RegisterJNode, RegisterINode);
      return DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32,
                         SDValue(interp, 0), SDValue(interp, 1));
    }
    case AMDGPUIntrinsic::R600_tex:
    case AMDGPUIntrinsic::R600_texc:
    case AMDGPUIntrinsic::R600_txl:
    case AMDGPUIntrinsic::R600_txlc:
    case AMDGPUIntrinsic::R600_txb:
    case AMDGPUIntrinsic::R600_txbc:
    case AMDGPUIntrinsic::R600_txf:
    case AMDGPUIntrinsic::R600_txq:
    case AMDGPUIntrinsic::R600_ddx:
    case AMDGPUIntrinsic::R600_ddy:
    case AMDGPUIntrinsic::R600_ldptr: {
      unsigned TextureOp;
      switch (IntrinsicID) {
      case AMDGPUIntrinsic::R600_tex:
        TextureOp = 0;
        break;
      case AMDGPUIntrinsic::R600_texc:
        TextureOp = 1;
        break;
      case AMDGPUIntrinsic::R600_txl:
        TextureOp = 2;
        break;
      case AMDGPUIntrinsic::R600_txlc:
        TextureOp = 3;
        break;
      case AMDGPUIntrinsic::R600_txb:
        TextureOp = 4;
        break;
      case AMDGPUIntrinsic::R600_txbc:
        TextureOp = 5;
        break;
      case AMDGPUIntrinsic::R600_txf:
        TextureOp = 6;
        break;
      case AMDGPUIntrinsic::R600_txq:
        TextureOp = 7;
        break;
      case AMDGPUIntrinsic::R600_ddx:
        TextureOp = 8;
        break;
      case AMDGPUIntrinsic::R600_ddy:
        TextureOp = 9;
        break;
      case AMDGPUIntrinsic::R600_ldptr:
        TextureOp = 10;
        break;
      default:
        llvm_unreachable("Unknown texture operation");
      }

      SDValue TexArgs[19] = {
        DAG.getConstant(TextureOp, MVT::i32),
        Op.getOperand(1),
        DAG.getConstant(0, MVT::i32),
        DAG.getConstant(1, MVT::i32),
        DAG.getConstant(2, MVT::i32),
        DAG.getConstant(3, MVT::i32),
        Op.getOperand(2),
        Op.getOperand(3),
        Op.getOperand(4),
        DAG.getConstant(0, MVT::i32),
        DAG.getConstant(1, MVT::i32),
        DAG.getConstant(2, MVT::i32),
        DAG.getConstant(3, MVT::i32),
        Op.getOperand(5),
        Op.getOperand(6),
        Op.getOperand(7),
        Op.getOperand(8),
        Op.getOperand(9),
        Op.getOperand(10)
      };
      return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs, 19);
    }
    case AMDGPUIntrinsic::AMDGPU_dp4: {
      SDValue Args[8] = {
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
                    DAG.getConstant(0, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
                    DAG.getConstant(0, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
                    DAG.getConstant(1, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
                    DAG.getConstant(1, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
                    DAG.getConstant(2, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
                    DAG.getConstant(2, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
                    DAG.getConstant(3, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
                    DAG.getConstant(3, MVT::i32))
      };
      return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args, 8);
    }

    case Intrinsic::r600_read_ngroups_x:
      return LowerImplicitParameter(DAG, VT, DL, 0);
    case Intrinsic::r600_read_ngroups_y:
      return LowerImplicitParameter(DAG, VT, DL, 1);
    case Intrinsic::r600_read_ngroups_z:
      return LowerImplicitParameter(DAG, VT, DL, 2);
    case Intrinsic::r600_read_global_size_x:
      return LowerImplicitParameter(DAG, VT, DL, 3);
    case Intrinsic::r600_read_global_size_y:
      return LowerImplicitParameter(DAG, VT, DL, 4);
    case Intrinsic::r600_read_global_size_z:
      return LowerImplicitParameter(DAG, VT, DL, 5);
    case Intrinsic::r600_read_local_size_x:
      return LowerImplicitParameter(DAG, VT, DL, 6);
    case Intrinsic::r600_read_local_size_y:
      return LowerImplicitParameter(DAG, VT, DL, 7);
    case Intrinsic::r600_read_local_size_z:
      return LowerImplicitParameter(DAG, VT, DL, 8);

    case Intrinsic::r600_read_tgid_x:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T1_X, VT);
    case Intrinsic::r600_read_tgid_y:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T1_Y, VT);
    case Intrinsic::r600_read_tgid_z:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T1_Z, VT);
    case Intrinsic::r600_read_tidig_x:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T0_X, VT);
    case Intrinsic::r600_read_tidig_y:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T0_Y, VT);
    case Intrinsic::r600_read_tidig_z:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T0_Z, VT);
    }
    // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
    break;
  }
  } // end switch(Op.getOpcode())
  return SDValue();
}

void R600TargetLowering::ReplaceNodeResults(SDNode *N,
                                            SmallVectorImpl<SDValue> &Results,
                                            SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  default:
    AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
    return;
  case ISD::FP_TO_UINT: Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
    return;
  case ISD::LOAD: {
    SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode();
    Results.push_back(SDValue(Node, 0));
    Results.push_back(SDValue(Node, 1));
    // XXX: LLVM seems not to replace the Chain value inside
    // CustomWidenLowerNode.
    DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(Node, 1));
    return;
  }
  case ISD::STORE: {
    SDNode *Node = LowerSTORE(SDValue(N, 0), DAG).getNode();
    Results.push_back(SDValue(Node, 0));
    return;
  }
  }
}

SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
  // On hw >= R700, COS/SIN input must be between -1.0 and 1.0.
  // Thus we lower them to TRIG ( FRACT ( x / 2Pi + 0.5) - 0.5)
  EVT VT = Op.getValueType();
  SDValue Arg = Op.getOperand(0);
  SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, SDLoc(Op), VT,
      DAG.getNode(ISD::FADD, SDLoc(Op), VT,
          DAG.getNode(ISD::FMUL, SDLoc(Op), VT, Arg,
              DAG.getConstantFP(0.15915494309, MVT::f32)),
          DAG.getConstantFP(0.5, MVT::f32)));
  unsigned TrigNode;
  switch (Op.getOpcode()) {
  case ISD::FCOS:
    TrigNode = AMDGPUISD::COS_HW;
    break;
  case ISD::FSIN:
    TrigNode = AMDGPUISD::SIN_HW;
    break;
  default:
    llvm_unreachable("Wrong trig opcode");
  }
  SDValue TrigVal = DAG.getNode(TrigNode, SDLoc(Op), VT,
      DAG.getNode(ISD::FADD, SDLoc(Op), VT, FractPart,
          DAG.getConstantFP(-0.5, MVT::f32)));
  if (Gen >= AMDGPUSubtarget::R700)
    return TrigVal;
  // On R600 hw, COS/SIN input must be between -Pi and Pi.
  return DAG.getNode(ISD::FMUL, SDLoc(Op), VT, TrigVal,
                     DAG.getConstantFP(3.14159265359, MVT::f32));
}
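// Worked example for the normalization in LowerTrig, where 0.15915494309
// ~= 1/(2*Pi): for x = 5*Pi/2, FRACT(x/(2*Pi) + 0.5) - 0.5
// = FRACT(1.75) - 0.5 = 0.25, i.e. a quarter turn, and the hardware trig
// unit (which presumably consumes revolutions rather than radians on R700+)
// yields sin(5*Pi/2) = 1. The FRACT wrap keeps the operand in [-0.5, 0.5],
// inside the [-1.0, 1.0] window the comment above requires.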
SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
  return DAG.getNode(
      ISD::SETCC,
      SDLoc(Op),
      MVT::i1,
      Op, DAG.getConstantFP(0.0f, MVT::f32),
      DAG.getCondCode(ISD::SETNE)
      );
}

SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
                                                   SDLoc DL,
                                                   unsigned DwordOffset) const {
  unsigned ByteOffset = DwordOffset * 4;
  PointerType *PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
                                          AMDGPUAS::CONSTANT_BUFFER_0);

  // We shouldn't be using an offset wider than 16-bits for implicit parameters.
  assert(isInt<16>(ByteOffset));

  return DAG.getLoad(VT, DL, DAG.getEntryNode(),
                     DAG.getConstant(ByteOffset, MVT::i32), // PTR
                     MachinePointerInfo(ConstantPointerNull::get(PtrType)),
                     false, false, false, 0);
}

bool R600TargetLowering::isZero(SDValue Op) const {
  if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
    return Cst->isNullValue();
  } else if (ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)) {
    return CstFP->isZero();
  } else {
    return false;
  }
}

SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDValue True = Op.getOperand(2);
  SDValue False = Op.getOperand(3);
  SDValue CC = Op.getOperand(4);
  SDValue Temp;

  // LHS and RHS are guaranteed to be the same value type
  EVT CompareVT = LHS.getValueType();

  // Check if we can lower this to a native operation.

  // Try to lower to a SET* instruction:
  //
  // SET* can match the following patterns:
  //
  // select_cc f32, f32, -1, 0, cc_supported
  // select_cc f32, f32, 1.0f, 0.0f, cc_supported
  // select_cc i32, i32, -1, 0, cc_supported
  //

  // Move hardware True/False values to the correct operand.
  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
  ISD::CondCode InverseCC =
      ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
  if (isHWTrueValue(False) && isHWFalseValue(True)) {
    if (isCondCodeLegal(InverseCC, CompareVT.getSimpleVT())) {
      std::swap(False, True);
      CC = DAG.getCondCode(InverseCC);
    } else {
      ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InverseCC);
      if (isCondCodeLegal(SwapInvCC, CompareVT.getSimpleVT())) {
        std::swap(False, True);
        std::swap(LHS, RHS);
        CC = DAG.getCondCode(SwapInvCC);
      }
    }
  }

  if (isHWTrueValue(True) && isHWFalseValue(False) &&
      (CompareVT == VT || VT == MVT::i32)) {
    // This can be matched by a SET* instruction.
    return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
  }

  // Try to lower to a CND* instruction:
  //
  // CND* can match the following patterns:
  //
  // select_cc f32, 0.0, f32, f32, cc_supported
  // select_cc f32, 0.0, i32, i32, cc_supported
  // select_cc i32, 0, f32, f32, cc_supported
  // select_cc i32, 0, i32, i32, cc_supported
  //

  // Try to move the zero value to the RHS
  if (isZero(LHS)) {
    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
    // Try swapping the operands
    ISD::CondCode CCSwapped = ISD::getSetCCSwappedOperands(CCOpcode);
    if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
      std::swap(LHS, RHS);
      CC = DAG.getCondCode(CCSwapped);
    } else {
      // Try inverting the condition and then swapping the operands
      ISD::CondCode CCInv = ISD::getSetCCInverse(CCOpcode, CompareVT.isInteger());
      CCSwapped = ISD::getSetCCSwappedOperands(CCInv);
      if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
        std::swap(True, False);
        std::swap(LHS, RHS);
        CC = DAG.getCondCode(CCSwapped);
      }
    }
  }
  if (isZero(RHS)) {
    SDValue Cond = LHS;
    SDValue Zero = RHS;
    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
    if (CompareVT != VT) {
      // Bitcast True / False to the correct types. This will end up being
      // a nop, but it allows us to define only a single pattern in the
      // .TD files for each CND* instruction rather than having to have
      // one pattern for integer True/False and one for fp True/False
      True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
      False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
    }

    switch (CCOpcode) {
    case ISD::SETONE:
    case ISD::SETUNE:
    case ISD::SETNE:
      CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
      Temp = True;
      True = False;
      False = Temp;
      break;
    default:
      break;
    }
    SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
                                     Cond, Zero,
                                     True, False,
                                     DAG.getCondCode(CCOpcode));
    return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
  }

  // Possible Min/Max pattern
  SDValue MinMax = LowerMinMax(Op, DAG);
  if (MinMax.getNode()) {
    return MinMax;
  }

  // If we make it this far it means we have no native instructions to handle
  // this SELECT_CC, so we must lower it.
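  // Concretely, select_cc LHS, RHS, True, False, CC becomes:
  //   Cond = select_cc LHS, RHS, HWTrue, HWFalse, CC   (a native SET*)
  //   select_cc Cond, HWFalse, True, False, SETNE      (a native CND*)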
  SDValue HWTrue, HWFalse;

  if (CompareVT == MVT::f32) {
    HWTrue = DAG.getConstantFP(1.0f, CompareVT);
    HWFalse = DAG.getConstantFP(0.0f, CompareVT);
  } else if (CompareVT == MVT::i32) {
    HWTrue = DAG.getConstant(-1, CompareVT);
    HWFalse = DAG.getConstant(0, CompareVT);
  } else {
    llvm_unreachable("Unhandled value type in LowerSELECT_CC");
  }

  // Lower this unsupported SELECT_CC into a combination of two supported
  // SELECT_CC operations.
  SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
                             LHS, RHS, HWTrue, HWFalse, CC);

  return DAG.getNode(ISD::SELECT_CC, DL, VT,
                     Cond, HWFalse,
                     True, False,
                     DAG.getCondCode(ISD::SETNE));
}

/// LLVM generates byte-addressed pointers. For indirect addressing, we need to
/// convert these pointers to a register index. Each register holds
/// 16 bytes (4 x 32-bit sub-registers), but we need to take into account the
/// \p StackWidth, which tells us how many of the 4 sub-registers will be used
/// for indirect addressing.
SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
                                               unsigned StackWidth,
                                               SelectionDAG &DAG) const {
  unsigned SRLPad;
  switch (StackWidth) {
  case 1:
    SRLPad = 2;
    break;
  case 2:
    SRLPad = 3;
    break;
  case 4:
    SRLPad = 4;
    break;
  default: llvm_unreachable("Invalid stack width");
  }

  return DAG.getNode(ISD::SRL, SDLoc(Ptr), Ptr.getValueType(), Ptr,
                     DAG.getConstant(SRLPad, MVT::i32));
}

void R600TargetLowering::getStackAddress(unsigned StackWidth,
                                         unsigned ElemIdx,
                                         unsigned &Channel,
                                         unsigned &PtrIncr) const {
  switch (StackWidth) {
  default:
  case 1:
    Channel = 0;
    if (ElemIdx > 0) {
      PtrIncr = 1;
    } else {
      PtrIncr = 0;
    }
    break;
  case 2:
    Channel = ElemIdx % 2;
    if (ElemIdx == 2) {
      PtrIncr = 1;
    } else {
      PtrIncr = 0;
    }
    break;
  case 4:
    Channel = ElemIdx;
    PtrIncr = 0;
    break;
  }
}

SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
  SDValue Chain = Op.getOperand(0);
  SDValue Value = Op.getOperand(1);
  SDValue Ptr = Op.getOperand(2);

  SDValue Result = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
  if (Result.getNode()) {
    return Result;
  }

  if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS) {
    if (StoreNode->isTruncatingStore()) {
      EVT VT = Value.getValueType();
      assert(VT.bitsLE(MVT::i32));
      EVT MemVT = StoreNode->getMemoryVT();
      SDValue MaskConstant;
      if (MemVT == MVT::i8) {
        MaskConstant = DAG.getConstant(0xFF, MVT::i32);
      } else {
        assert(MemVT == MVT::i16);
        MaskConstant = DAG.getConstant(0xFFFF, MVT::i32);
      }
      SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, VT, Ptr,
                                      DAG.getConstant(2, MVT::i32));
      SDValue ByteIndex = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), Ptr,
                                      DAG.getConstant(0x00000003, VT));
      SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant);
      SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex,
                                  DAG.getConstant(3, VT));
      SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, Shift);
      SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, Shift);
      // XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32
      // vector instead.
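      // Example, assuming STORE_MSKOR merges {value, mask} into the addressed
      // dword: an i8 store of V to byte address 7 gives DWordAddr = 1,
      // ByteIndex = 3, Shift = 24, so the packed source built below is
      // { V << 24, 0, 0, 0xFF << 24 }.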
      SDValue Src[4] = {
        ShiftedValue,
        DAG.getConstant(0, MVT::i32),
        DAG.getConstant(0, MVT::i32),
        Mask
      };
      SDValue Input = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Src, 4);
      SDValue Args[3] = { Chain, Input, DWordAddr };
      return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL,
                                     Op->getVTList(), Args, 3, MemVT,
                                     StoreNode->getMemOperand());
    } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR &&
               Value.getValueType().bitsGE(MVT::i32)) {
      // Convert pointer from byte address to dword address.
      Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
                        DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
                                    Ptr, DAG.getConstant(2, MVT::i32)));

      if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
        llvm_unreachable("Truncated and indexed stores not supported yet");
      } else {
        Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
      }
      return Chain;
    }
  }

  EVT ValueVT = Value.getValueType();

  if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
    return SDValue();
  }

  SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
  if (Ret.getNode()) {
    return Ret;
  }
  // Lowering for indirect addressing

  const MachineFunction &MF = DAG.getMachineFunction();
  const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
      getTargetMachine().getFrameLowering());
  unsigned StackWidth = TFL->getStackWidth(MF);

  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);

  if (ValueVT.isVector()) {
    unsigned NumElemVT = ValueVT.getVectorNumElements();
    EVT ElemVT = ValueVT.getVectorElementType();
    SDValue Stores[4];

    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
                                      "vector width in load");

    for (unsigned i = 0; i < NumElemVT; ++i) {
      unsigned Channel, PtrIncr;
      getStackAddress(StackWidth, i, Channel, PtrIncr);
      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
                        DAG.getConstant(PtrIncr, MVT::i32));
      SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
                                 Value, DAG.getConstant(i, MVT::i32));

      Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
                              Chain, Elem, Ptr,
                              DAG.getTargetConstant(Channel, MVT::i32));
    }
    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores, NumElemVT);
  } else {
    if (ValueVT == MVT::i8) {
      Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
    }
    Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr,
                        DAG.getTargetConstant(0, MVT::i32)); // Channel
  }

  return Chain;
}

// Return 512 + (kc_bank << 12)
static int
ConstantAddressBlock(unsigned AddressSpace) {
  switch (AddressSpace) {
  case AMDGPUAS::CONSTANT_BUFFER_0:
    return 512;
  case AMDGPUAS::CONSTANT_BUFFER_1:
    return 512 + 4096;
  case AMDGPUAS::CONSTANT_BUFFER_2:
    return 512 + 4096 * 2;
  case AMDGPUAS::CONSTANT_BUFFER_3:
    return 512 + 4096 * 3;
  case AMDGPUAS::CONSTANT_BUFFER_4:
    return 512 + 4096 * 4;
  case AMDGPUAS::CONSTANT_BUFFER_5:
    return 512 + 4096 * 5;
  case AMDGPUAS::CONSTANT_BUFFER_6:
    return 512 + 4096 * 6;
  case AMDGPUAS::CONSTANT_BUFFER_7:
    return 512 + 4096 * 7;
  case AMDGPUAS::CONSTANT_BUFFER_8:
    return 512 + 4096 * 8;
  case AMDGPUAS::CONSTANT_BUFFER_9:
    return 512 + 4096 * 9;
  case AMDGPUAS::CONSTANT_BUFFER_10:
    return 512 + 4096 * 10;
  case AMDGPUAS::CONSTANT_BUFFER_11:
    return 512 + 4096 * 11;
  case AMDGPUAS::CONSTANT_BUFFER_12:
    return 512 + 4096 * 12;
  case AMDGPUAS::CONSTANT_BUFFER_13:
    return 512 + 4096 * 13;
  case AMDGPUAS::CONSTANT_BUFFER_14:
    return 512 + 4096 * 14;
  case AMDGPUAS::CONSTANT_BUFFER_15:
    return 512 + 4096 * 15;
  default:
    return -1;
  }
}
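// For example, CONSTANT_BUFFER_3 maps to 512 + (3 << 12) = 12800; the switch
// above simply spells out 512 + 4096 * kc_bank for each address space.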

SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  SDLoc DL(Op);
  LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
  SDValue Chain = Op.getOperand(0);
  SDValue Ptr = Op.getOperand(1);
  SDValue LoweredLoad;

  SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG);
  if (Ret.getNode()) {
    SDValue Ops[2];
    Ops[0] = Ret;
    Ops[1] = Chain;
    return DAG.getMergeValues(Ops, 2, DL);
  }

  if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) {
    SDValue MergedValues[2] = {
      SplitVectorLoad(Op, DAG),
      Chain
    };
    return DAG.getMergeValues(MergedValues, 2, DL);
  }

  int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
  if (ConstantBlock > -1 &&
      ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) ||
       (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) {
    SDValue Result;
    if (isa<ConstantExpr>(LoadNode->getSrcValue()) ||
        isa<Constant>(LoadNode->getSrcValue()) ||
        isa<ConstantSDNode>(Ptr)) {
      SDValue Slots[4];
      for (unsigned i = 0; i < 4; i++) {
        // We want the Const position encoded with the following formula:
        // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
        // const_index is Ptr computed by llvm using an alignment of 16.
        // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and
        // then div by 4 at the ISel step.
        SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
            DAG.getConstant(4 * i + ConstantBlock * 16, MVT::i32));
        Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
      }
      EVT NewVT = MVT::v4i32;
      unsigned NumElements = 4;
      if (VT.isVector()) {
        NewVT = VT;
        NumElements = VT.getVectorNumElements();
      }
      Result = DAG.getNode(ISD::BUILD_VECTOR, DL, NewVT, Slots, NumElements);
    } else {
      // A non-constant ptr can't be folded; keep it as a v4f32 load.
      Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
          DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, MVT::i32)),
          DAG.getConstant(LoadNode->getAddressSpace() -
                          AMDGPUAS::CONSTANT_BUFFER_0, MVT::i32)
          );
    }

    if (!VT.isVector()) {
      Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
                           DAG.getConstant(0, MVT::i32));
    }

    SDValue MergedValues[2] = {
      Result,
      Chain
    };
    return DAG.getMergeValues(MergedValues, 2, DL);
  }

  // For most operations returning SDValue() will result in the node being
  // expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we
  // need to manually expand loads that may be legal in some address spaces and
  // illegal in others. SEXT loads from CONSTANT_BUFFER_0 are supported for
  // compute shaders, since the data is sign extended when it is uploaded to the
  // buffer. However SEXT loads from other address spaces are not supported, so
  // we need to expand them here.
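  // The expansion below is the classic shift pair: an i8 sext-loaded into an
  // i32 uses ShiftAmount = 32 - 8 = 24, and (sra (shl x, 24), 24) replicates
  // the sign bit through the upper bits.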
  if (LoadNode->getExtensionType() == ISD::SEXTLOAD) {
    EVT MemVT = LoadNode->getMemoryVT();
    assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8));
    SDValue ShiftAmount =
        DAG.getConstant(VT.getSizeInBits() - MemVT.getSizeInBits(), MVT::i32);
    SDValue NewLoad = DAG.getExtLoad(ISD::EXTLOAD, DL, VT, Chain, Ptr,
                                     LoadNode->getPointerInfo(), MemVT,
                                     LoadNode->isVolatile(),
                                     LoadNode->isNonTemporal(),
                                     LoadNode->getAlignment());
    SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, NewLoad, ShiftAmount);
    SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Shl, ShiftAmount);

    SDValue MergedValues[2] = { Sra, Chain };
    return DAG.getMergeValues(MergedValues, 2, DL);
  }

  if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
    return SDValue();
  }

  // Lowering for indirect addressing
  const MachineFunction &MF = DAG.getMachineFunction();
  const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
      getTargetMachine().getFrameLowering());
  unsigned StackWidth = TFL->getStackWidth(MF);

  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);

  if (VT.isVector()) {
    unsigned NumElemVT = VT.getVectorNumElements();
    EVT ElemVT = VT.getVectorElementType();
    SDValue Loads[4];

    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
                                      "vector width in load");

    for (unsigned i = 0; i < NumElemVT; ++i) {
      unsigned Channel, PtrIncr;
      getStackAddress(StackWidth, i, Channel, PtrIncr);
      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
                        DAG.getConstant(PtrIncr, MVT::i32));
      Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
                             Chain, Ptr,
                             DAG.getTargetConstant(Channel, MVT::i32),
                             Op.getOperand(2));
    }
    for (unsigned i = NumElemVT; i < 4; ++i) {
      Loads[i] = DAG.getUNDEF(ElemVT);
    }
    EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4);
    LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads, 4);
  } else {
    LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
                              Chain, Ptr,
                              DAG.getTargetConstant(0, MVT::i32), // Channel
                              Op.getOperand(2));
  }

  SDValue Ops[2];
  Ops[0] = LoweredLoad;
  Ops[1] = Chain;

  return DAG.getMergeValues(Ops, 2, DL);
}
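// Note on the 36-byte offset used below: it presumably skips the nine
// implicit i32 parameters (ngroups, global_size and local_size, xyz each)
// that LowerImplicitParameter reads at dword offsets 0-8 of
// CONSTANT_BUFFER_0.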
/// XXX Only kernel functions are supported, so we can assume for now that
/// every function is a kernel function, but in the future we should use
/// separate calling conventions for kernel and non-kernel functions.
SDValue R600TargetLowering::LowerFormalArguments(
    SDValue Chain,
    CallingConv::ID CallConv,
    bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins,
    SDLoc DL, SelectionDAG &DAG,
    SmallVectorImpl<SDValue> &InVals) const {
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
                 getTargetMachine(), ArgLocs, *DAG.getContext());
  MachineFunction &MF = DAG.getMachineFunction();
  unsigned ShaderType = MF.getInfo<R600MachineFunctionInfo>()->ShaderType;

  SmallVector<ISD::InputArg, 8> LocalIns;

  getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins,
                          LocalIns);

  AnalyzeFormalArguments(CCInfo, LocalIns);

  for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
    CCValAssign &VA = ArgLocs[i];
    EVT VT = Ins[i].VT;
    EVT MemVT = LocalIns[i].VT;

    if (ShaderType != ShaderType::COMPUTE) {
      unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass);
      SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT);
      InVals.push_back(Register);
      continue;
    }

    PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
                                          AMDGPUAS::CONSTANT_BUFFER_0);

    // i64 isn't a legal type, so the register type used ends up as i32, which
    // isn't expected here. It attempts to create this sextload, but it ends up
    // being invalid. Somehow this seems to work with i64 arguments, but breaks
    // for <1 x i64>.

    // The first 36 bytes of the input buffer contain information about the
    // thread group and global sizes.
    SDValue Arg = DAG.getExtLoad(ISD::SEXTLOAD, DL, VT, Chain,
                                 DAG.getConstant(36 + VA.getLocMemOffset(), MVT::i32),
                                 MachinePointerInfo(UndefValue::get(PtrTy)),
                                 MemVT, false, false, 4);
    // 4 is the preferred alignment for the CONSTANT memory space.
    InVals.push_back(Arg);
  }
  return Chain;
}

EVT R600TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
  if (!VT.isVector()) return MVT::i32;
  return VT.changeVectorElementTypeToInteger();
}
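// Swizzle select values used by the helpers below: 0-3 select the X/Y/Z/W
// lanes of the source, 4 (SEL_0) and 5 (SEL_1) read the constants 0.0 and
// 1.0, and 7 (SEL_MASK_WRITE) masks the lane off entirely.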

static SDValue
CompactSwizzlableVector(SelectionDAG &DAG, SDValue VectorEntry,
                        DenseMap<unsigned, unsigned> &RemapSwizzle) {
  assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
  assert(RemapSwizzle.empty());
  SDValue NewBldVec[4] = {
    VectorEntry.getOperand(0),
    VectorEntry.getOperand(1),
    VectorEntry.getOperand(2),
    VectorEntry.getOperand(3)
  };

  for (unsigned i = 0; i < 4; i++) {
    if (NewBldVec[i].getOpcode() == ISD::UNDEF)
      // We mask the write here to teach later passes that the ith element of
      // this vector is undef. Thus we can use it to reduce 128-bit register
      // usage, break false dependencies and additionally make assembly easier
      // to read.
      RemapSwizzle[i] = 7; // SEL_MASK_WRITE
    if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(NewBldVec[i])) {
      if (C->isZero()) {
        RemapSwizzle[i] = 4; // SEL_0
        NewBldVec[i] = DAG.getUNDEF(MVT::f32);
      } else if (C->isExactlyValue(1.0)) {
        RemapSwizzle[i] = 5; // SEL_1
        NewBldVec[i] = DAG.getUNDEF(MVT::f32);
      }
    }

    if (NewBldVec[i].getOpcode() == ISD::UNDEF)
      continue;
    for (unsigned j = 0; j < i; j++) {
      if (NewBldVec[i] == NewBldVec[j]) {
        NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType());
        RemapSwizzle[i] = j;
        break;
      }
    }
  }

  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
                     VectorEntry.getValueType(), NewBldVec, 4);
}

static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
                                DenseMap<unsigned, unsigned> &RemapSwizzle) {
  assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
  assert(RemapSwizzle.empty());
  SDValue NewBldVec[4] = {
    VectorEntry.getOperand(0),
    VectorEntry.getOperand(1),
    VectorEntry.getOperand(2),
    VectorEntry.getOperand(3)
  };
  bool isUnmovable[4] = { false, false, false, false };
  for (unsigned i = 0; i < 4; i++) {
    RemapSwizzle[i] = i;
    if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
      unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
          ->getZExtValue();
      if (i == Idx)
        isUnmovable[Idx] = true;
    }
  }

  for (unsigned i = 0; i < 4; i++) {
    if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
      unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
          ->getZExtValue();
      if (isUnmovable[Idx])
        continue;
      // Swap i and Idx
      std::swap(NewBldVec[Idx], NewBldVec[i]);
      std::swap(RemapSwizzle[i], RemapSwizzle[Idx]);
      break;
    }
  }

  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
                     VectorEntry.getValueType(), NewBldVec, 4);
}


SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector,
                                            SDValue Swz[4],
                                            SelectionDAG &DAG) const {
  assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR);
  // Old -> New swizzle values
  DenseMap<unsigned, unsigned> SwizzleRemap;

  BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap);
  for (unsigned i = 0; i < 4; i++) {
    unsigned Idx = dyn_cast<ConstantSDNode>(Swz[i])->getZExtValue();
    if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
      Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
  }

  SwizzleRemap.clear();
  BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap);
  for (unsigned i = 0; i < 4; i++) {
    unsigned Idx = dyn_cast<ConstantSDNode>(Swz[i])->getZExtValue();
    if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
      Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
  }

  return BuildVector;
}
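// Usage note: OptimizeSwizzle rewrites a BUILD_VECTOR twice (first folding
// undef/0.0/1.0/duplicate lanes, then moving one movable lane into place)
// and after each pass redirects the caller's Swz[] selects through the
// old -> new RemapSwizzle table, so EXPORT and TEXTURE_FETCH keep reading
// the lanes they originally meant.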

//===----------------------------------------------------------------------===//
// Custom DAG Optimizations
//===----------------------------------------------------------------------===//

SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;

  switch (N->getOpcode()) {
  // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
  case ISD::FP_ROUND: {
    SDValue Arg = N->getOperand(0);
    if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
      return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), N->getValueType(0),
                         Arg.getOperand(0));
    }
    break;
  }

  // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
  // (i32 select_cc f32, f32, -1, 0 cc)
  //
  // Mesa's GLSL frontend generates the above pattern a lot and we can lower
  // this to one of the SET*_DX10 instructions.
  case ISD::FP_TO_SINT: {
    SDValue FNeg = N->getOperand(0);
    if (FNeg.getOpcode() != ISD::FNEG) {
      return SDValue();
    }
    SDValue SelectCC = FNeg.getOperand(0);
    if (SelectCC.getOpcode() != ISD::SELECT_CC ||
        SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
        SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
        !isHWTrueValue(SelectCC.getOperand(2)) ||
        !isHWFalseValue(SelectCC.getOperand(3))) {
      return SDValue();
    }

    return DAG.getNode(ISD::SELECT_CC, SDLoc(N), N->getValueType(0),
                       SelectCC.getOperand(0), // LHS
                       SelectCC.getOperand(1), // RHS
                       DAG.getConstant(-1, MVT::i32), // True
                       DAG.getConstant(0, MVT::i32), // False
                       SelectCC.getOperand(4)); // CC
  }

  // insert_vector_elt (build_vector elt0, ... , eltN), NewEltIdx, idx
  // => build_vector elt0, ... , NewEltIdx, ... , eltN
  case ISD::INSERT_VECTOR_ELT: {
    SDValue InVec = N->getOperand(0);
    SDValue InVal = N->getOperand(1);
    SDValue EltNo = N->getOperand(2);
    SDLoc dl(N);

    // If the inserted element is an UNDEF, just use the input vector.
    if (InVal.getOpcode() == ISD::UNDEF)
      return InVec;

    EVT VT = InVec.getValueType();

    // If we can't generate a legal BUILD_VECTOR, exit
    if (!isOperationLegal(ISD::BUILD_VECTOR, VT))
      return SDValue();

    // Check that we know which element is being inserted
    if (!isa<ConstantSDNode>(EltNo))
      return SDValue();
    unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();

    // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
    // be converted to a BUILD_VECTOR). Fill in the Ops vector with the
    // vector elements.
    SmallVector<SDValue, 8> Ops;
    if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
      Ops.append(InVec.getNode()->op_begin(),
                 InVec.getNode()->op_end());
    } else if (InVec.getOpcode() == ISD::UNDEF) {
      unsigned NElts = VT.getVectorNumElements();
      Ops.append(NElts, DAG.getUNDEF(InVal.getValueType()));
    } else {
      return SDValue();
    }

    // Insert the element
    if (Elt < Ops.size()) {
      // All the operands of BUILD_VECTOR must have the same type;
      // we enforce that here.
      EVT OpVT = Ops[0].getValueType();
      if (InVal.getValueType() != OpVT)
        InVal = OpVT.bitsGT(InVal.getValueType()) ?
            DAG.getNode(ISD::ANY_EXTEND, dl, OpVT, InVal) :
            DAG.getNode(ISD::TRUNCATE, dl, OpVT, InVal);
      Ops[Elt] = InVal;
    }

    // Return the new vector
    return DAG.getNode(ISD::BUILD_VECTOR, dl,
                       VT, &Ops[0], Ops.size());
  }

  // Extract_vec (Build_vector) generated by custom lowering
  // also needs to be combined here.
  case ISD::EXTRACT_VECTOR_ELT: {
    SDValue Arg = N->getOperand(0);
    if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
        unsigned Element = Const->getZExtValue();
        return Arg->getOperand(Element);
      }
    }
    if (Arg.getOpcode() == ISD::BITCAST &&
        Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
        unsigned Element = Const->getZExtValue();
        return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getVTList(),
                           Arg->getOperand(0).getOperand(Element));
      }
    }
  }

  case ISD::SELECT_CC: {
    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
    // selectcc x, y, a, b, inv(cc)
    //
    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
    // selectcc x, y, a, b, cc
    SDValue LHS = N->getOperand(0);
    if (LHS.getOpcode() != ISD::SELECT_CC) {
      return SDValue();
    }

    SDValue RHS = N->getOperand(1);
    SDValue True = N->getOperand(2);
    SDValue False = N->getOperand(3);
    ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();

    if (LHS.getOperand(2).getNode() != True.getNode() ||
        LHS.getOperand(3).getNode() != False.getNode() ||
        RHS.getNode() != False.getNode()) {
      return SDValue();
    }

    switch (NCC) {
    default: return SDValue();
    case ISD::SETNE: return LHS;
    case ISD::SETEQ: {
      ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
      LHSCC = ISD::getSetCCInverse(LHSCC,
                                   LHS.getOperand(0).getValueType().isInteger());
      if (DCI.isBeforeLegalizeOps() ||
          isCondCodeLegal(LHSCC, LHS.getOperand(0).getSimpleValueType()))
        return DAG.getSelectCC(SDLoc(N),
                               LHS.getOperand(0),
                               LHS.getOperand(1),
                               LHS.getOperand(2),
                               LHS.getOperand(3),
                               LHSCC);
      break;
    }
    }
    return SDValue();
  }

  case AMDGPUISD::EXPORT: {
    SDValue Arg = N->getOperand(1);
    if (Arg.getOpcode() != ISD::BUILD_VECTOR)
      break;

    SDValue NewArgs[8] = {
      N->getOperand(0), // Chain
      SDValue(),
      N->getOperand(2), // ArrayBase
      N->getOperand(3), // Type
      N->getOperand(4), // SWZ_X
      N->getOperand(5), // SWZ_Y
      N->getOperand(6), // SWZ_Z
      N->getOperand(7)  // SWZ_W
    };
    SDLoc DL(N);
    NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG);
    return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs, 8);
  }
  case AMDGPUISD::TEXTURE_FETCH: {
    SDValue Arg = N->getOperand(1);
    if (Arg.getOpcode() != ISD::BUILD_VECTOR)
      break;

    SDValue NewArgs[19] = {
      N->getOperand(0),
      N->getOperand(1),
      N->getOperand(2),
      N->getOperand(3),
      N->getOperand(4),
      N->getOperand(5),
      N->getOperand(6),
      N->getOperand(7),
      N->getOperand(8),
      N->getOperand(9),
      N->getOperand(10),
      N->getOperand(11),
      N->getOperand(12),
      N->getOperand(13),
      N->getOperand(14),
      N->getOperand(15),
      N->getOperand(16),
      N->getOperand(17),
    SDValue LHS = N->getOperand(0);
    if (LHS.getOpcode() != ISD::SELECT_CC) {
      return SDValue();
    }

    SDValue RHS = N->getOperand(1);
    SDValue True = N->getOperand(2);
    SDValue False = N->getOperand(3);
    ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();

    if (LHS.getOperand(2).getNode() != True.getNode() ||
        LHS.getOperand(3).getNode() != False.getNode() ||
        RHS.getNode() != False.getNode()) {
      return SDValue();
    }

    switch (NCC) {
    default: return SDValue();
    case ISD::SETNE: return LHS;
    case ISD::SETEQ: {
      ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
      LHSCC = ISD::getSetCCInverse(LHSCC,
                                   LHS.getOperand(0).getValueType().isInteger());
      if (DCI.isBeforeLegalizeOps() ||
          isCondCodeLegal(LHSCC, LHS.getOperand(0).getSimpleValueType()))
        return DAG.getSelectCC(SDLoc(N),
                               LHS.getOperand(0),
                               LHS.getOperand(1),
                               LHS.getOperand(2),
                               LHS.getOperand(3),
                               LHSCC);
      break;
    }
    }
    return SDValue();
  }

  case AMDGPUISD::EXPORT: {
    SDValue Arg = N->getOperand(1);
    if (Arg.getOpcode() != ISD::BUILD_VECTOR)
      break;

    SDValue NewArgs[8] = {
      N->getOperand(0), // Chain
      SDValue(),
      N->getOperand(2), // ArrayBase
      N->getOperand(3), // Type
      N->getOperand(4), // SWZ_X
      N->getOperand(5), // SWZ_Y
      N->getOperand(6), // SWZ_Z
      N->getOperand(7)  // SWZ_W
    };
    SDLoc DL(N);
    NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG);
    return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs, 8);
  }
  case AMDGPUISD::TEXTURE_FETCH: {
    SDValue Arg = N->getOperand(1);
    if (Arg.getOpcode() != ISD::BUILD_VECTOR)
      break;

    SDValue NewArgs[19] = {
      N->getOperand(0),
      N->getOperand(1),
      N->getOperand(2),
      N->getOperand(3),
      N->getOperand(4),
      N->getOperand(5),
      N->getOperand(6),
      N->getOperand(7),
      N->getOperand(8),
      N->getOperand(9),
      N->getOperand(10),
      N->getOperand(11),
      N->getOperand(12),
      N->getOperand(13),
      N->getOperand(14),
      N->getOperand(15),
      N->getOperand(16),
      N->getOperand(17),
      N->getOperand(18)
    };
    NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG);
    return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, SDLoc(N), N->getVTList(),
                       NewArgs, 19);
  }
  }
  return SDValue();
}

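// Try to fold one use of a special source value into the machine node that
// consumes it. FNEG_R600/FABS_R600 sources become the neg/abs modifier
// operands, a CONST_COPY becomes an ALU_CONST register read plus its sel
// index (when TII->fitsConstReadLimitations allows another constant read),
// and MOV_IMM_I32/MOV_IMM_F32 become an inline constant register (ZERO,
// HALF, ONE, ONE_INT) or occupy the single ALU_LITERAL_X slot. Returns true
// and updates Src/Neg/Abs/Sel/Imm in place if the folding succeeded.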
static bool
FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg,
            SDValue &Abs, SDValue &Sel, SDValue &Imm, SelectionDAG &DAG) {
  const R600InstrInfo *TII =
      static_cast<const R600InstrInfo *>(DAG.getTarget().getInstrInfo());
  if (!Src.isMachineOpcode())
    return false;
  switch (Src.getMachineOpcode()) {
  case AMDGPU::FNEG_R600:
    if (!Neg.getNode())
      return false;
    Src = Src.getOperand(0);
    Neg = DAG.getTargetConstant(1, MVT::i32);
    return true;
  case AMDGPU::FABS_R600:
    if (!Abs.getNode())
      return false;
    Src = Src.getOperand(0);
    Abs = DAG.getTargetConstant(1, MVT::i32);
    return true;
  case AMDGPU::CONST_COPY: {
    unsigned Opcode = ParentNode->getMachineOpcode();
    bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;

    if (!Sel.getNode())
      return false;

    SDValue CstOffset = Src.getOperand(0);
    if (ParentNode->getValueType(0).isVector())
      return false;

    // Gather constant values
    int SrcIndices[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src2),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
    };
    std::vector<unsigned> Consts;
    for (unsigned i = 0; i < sizeof(SrcIndices) / sizeof(int); i++) {
      int OtherSrcIdx = SrcIndices[i];
      int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx);
      if (OtherSrcIdx < 0 || OtherSelIdx < 0)
        continue;
      if (HasDst) {
        OtherSrcIdx--;
        OtherSelIdx--;
      }
      if (RegisterSDNode *Reg =
              dyn_cast<RegisterSDNode>(ParentNode->getOperand(OtherSrcIdx))) {
        if (Reg->getReg() == AMDGPU::ALU_CONST) {
          ConstantSDNode *Cst =
              cast<ConstantSDNode>(ParentNode->getOperand(OtherSelIdx));
          Consts.push_back(Cst->getZExtValue());
        }
      }
    }

    ConstantSDNode *Cst = cast<ConstantSDNode>(CstOffset);
    Consts.push_back(Cst->getZExtValue());
    if (!TII->fitsConstReadLimitations(Consts)) {
      return false;
    }

    Sel = CstOffset;
    Src = DAG.getRegister(AMDGPU::ALU_CONST, MVT::f32);
    return true;
  }
  case AMDGPU::MOV_IMM_I32:
  case AMDGPU::MOV_IMM_F32: {
    unsigned ImmReg = AMDGPU::ALU_LITERAL_X;
    uint64_t ImmValue = 0;

    if (Src.getMachineOpcode() == AMDGPU::MOV_IMM_F32) {
      ConstantFPSDNode *FPC = cast<ConstantFPSDNode>(Src.getOperand(0));
      float FloatValue = FPC->getValueAPF().convertToFloat();
      if (FloatValue == 0.0) {
        ImmReg = AMDGPU::ZERO;
      } else if (FloatValue == 0.5) {
        ImmReg = AMDGPU::HALF;
      } else if (FloatValue == 1.0) {
        ImmReg = AMDGPU::ONE;
      } else {
        ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue();
      }
    } else {
      ConstantSDNode *C = cast<ConstantSDNode>(Src.getOperand(0));
      uint64_t Value = C->getZExtValue();
      if (Value == 0) {
        ImmReg = AMDGPU::ZERO;
      } else if (Value == 1) {
        ImmReg = AMDGPU::ONE_INT;
      } else {
        ImmValue = Value;
      }
    }

    // Check that we aren't already using an immediate.
    // XXX: It's possible for an instruction to have more than one
    // immediate operand, but this is not supported yet.
    if (ImmReg == AMDGPU::ALU_LITERAL_X) {
      if (!Imm.getNode())
        return false;
      ConstantSDNode *C = cast<ConstantSDNode>(Imm);
      if (C->getZExtValue())
        return false;
      Imm = DAG.getTargetConstant(ImmValue, MVT::i32);
    }
    Src = DAG.getRegister(ImmReg, MVT::i32);
    return true;
  }
  default:
    return false;
  }
}
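
// For example (illustrative): if a MUL_IEEE source is (MOV_IMM_F32 0.5), the
// MOV is folded away and the source is rewritten to read the HALF inline
// constant register; a value such as 2.5 has no inline encoding, so it would
// instead occupy the instruction's single ALU_LITERAL_X slot with its bit
// pattern.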

/// \brief Fold source operand modifiers and constants into the instructions
/// after they have been selected.
SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
                                            SelectionDAG &DAG) const {
  const R600InstrInfo *TII =
      static_cast<const R600InstrInfo *>(DAG.getTarget().getInstrInfo());
  if (!Node->isMachineOpcode())
    return Node;
  unsigned Opcode = Node->getMachineOpcode();
  SDValue FakeOp;

  std::vector<SDValue> Ops;
  for (SDNode::op_iterator I = Node->op_begin(), E = Node->op_end();
       I != E; ++I)
    Ops.push_back(*I);

  if (Opcode == AMDGPU::DOT_4) {
    int OperandIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
    };
    int NegIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_W),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_W)
    };
    int AbsIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_W),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_W)
    };
    for (unsigned i = 0; i < 8; i++) {
      if (OperandIdx[i] < 0)
        return Node;
      SDValue &Src = Ops[OperandIdx[i] - 1];
      SDValue &Neg = Ops[NegIdx[i] - 1];
      SDValue &Abs = Ops[AbsIdx[i] - 1];
      bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
      int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
      if (HasDst)
        SelIdx--;
      SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
      if (FoldOperand(Node, i, Src, Neg, Abs, Sel, FakeOp, DAG))
        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
    }
  } else if (Opcode == AMDGPU::REG_SEQUENCE) {
    for (unsigned i = 1, e = Node->getNumOperands(); i < e; i += 2) {
      SDValue &Src = Ops[i];
      if (FoldOperand(Node, i, Src, FakeOp, FakeOp, FakeOp, FakeOp, DAG))
        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
    }
  } else if (Opcode == AMDGPU::CLAMP_R600) {
    SDValue Src = Node->getOperand(0);
    if (!Src.isMachineOpcode() ||
        !TII->hasInstrModifiers(Src.getMachineOpcode()))
      return Node;
    int ClampIdx = TII->getOperandIdx(Src.getMachineOpcode(),
                                      AMDGPU::OpName::clamp);
    if (ClampIdx < 0)
      return Node;
    // Rebuild the source instruction with its clamp modifier set instead of
    // keeping a separate CLAMP_R600 node.
    std::vector<SDValue> SrcOps;
    unsigned NumOp = Src.getNumOperands();
    for (unsigned i = 0; i < NumOp; ++i)
      SrcOps.push_back(Src.getOperand(i));
    SrcOps[ClampIdx - 1] = DAG.getTargetConstant(1, MVT::i32);
    return DAG.getMachineNode(Src.getMachineOpcode(), SDLoc(Node),
                              Node->getVTList(), SrcOps);
  } else {
    if (!TII->hasInstrModifiers(Opcode))
      return Node;
    int OperandIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src2)
    };
    int NegIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src2_neg)
    };
    int AbsIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs),
      -1
    };
    for (unsigned i = 0; i < 3; i++) {
      if (OperandIdx[i] < 0)
        return Node;
      SDValue &Src = Ops[OperandIdx[i] - 1];
      SDValue &Neg = Ops[NegIdx[i] - 1];
      SDValue FakeAbs;
      SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs;
      bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
      int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
      int ImmIdx = TII->getOperandIdx(Opcode, AMDGPU::OpName::literal);
      if (HasDst) {
        SelIdx--;
        ImmIdx--;
      }
      SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
      SDValue &Imm = Ops[ImmIdx];
      if (FoldOperand(Node, i, Src, Neg, Abs, Sel, Imm, DAG))
        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
    }
  }

  return Node;
}
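
// Note: PostISelFolding runs after instruction selection has produced
// machine nodes (it is invoked from the AMDGPU DAG-to-DAG selector's
// PostprocessISelDAG hook), which is why it matches machine opcodes such as
// AMDGPU::DOT_4 and AMDGPU::CLAMP_R600 rather than generic ISD nodes.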