R600ISelLowering.cpp revision 29f1788de96cbf88ab87e3da130cf626b2e8e029
//===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Custom DAG lowering for R600
//
//===----------------------------------------------------------------------===//

#include "R600ISelLowering.h"
#include "R600Defines.h"
#include "R600InstrInfo.h"
#include "R600MachineFunctionInfo.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Function.h"

using namespace llvm;

R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
    AMDGPUTargetLowering(TM),
    Gen(TM.getSubtarget<AMDGPUSubtarget>().getGeneration()) {
  addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
  addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
  addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
  addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
  addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass);
  addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass);

  computeRegisterProperties();

  // Set condition code actions
  setCondCodeAction(ISD::SETO, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUO, MVT::f32, Expand);
  setCondCodeAction(ISD::SETLT, MVT::f32, Expand);
  setCondCodeAction(ISD::SETLE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOLT, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUGE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
  setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
  setCondCodeAction(ISD::SETULE, MVT::f32, Expand);

  setCondCodeAction(ISD::SETLE, MVT::i32, Expand);
  setCondCodeAction(ISD::SETLT, MVT::i32, Expand);
  setCondCodeAction(ISD::SETULE, MVT::i32, Expand);
  setCondCodeAction(ISD::SETULT, MVT::i32, Expand);

  setOperationAction(ISD::FCOS, MVT::f32, Custom);
  setOperationAction(ISD::FSIN, MVT::f32, Custom);

  setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
  setOperationAction(ISD::SETCC, MVT::v2i32, Expand);

  setOperationAction(ISD::BR_CC, MVT::i32, Expand);
  setOperationAction(ISD::BR_CC, MVT::f32, Expand);

  setOperationAction(ISD::FSUB, MVT::f32, Expand);

  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);

  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);

  setOperationAction(ISD::SETCC, MVT::i32, Expand);
  setOperationAction(ISD::SETCC, MVT::f32, Expand);
  setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);

  setOperationAction(ISD::SELECT, MVT::i32, Expand);
  setOperationAction(ISD::SELECT, MVT::f32, Expand);
  setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
  setOperationAction(ISD::SELECT, MVT::v2f32, Expand);
  setOperationAction(ISD::SELECT, MVT::v4i32, Expand);
  setOperationAction(ISD::SELECT, MVT::v4f32, Expand);
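
  // Editor's sketch (hedged, not in the upstream file): marking plain SELECT
  // as Expand leans on the generic legalizer, which in effect rewrites
  // (select cond, t, f) into (select_cc cond, 0, t, f, setne); that SELECT_CC
  // form is then handled by the custom lowering below.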

  // Legalize loads and stores to the private address space.
  setOperationAction(ISD::LOAD, MVT::i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);

  // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address
  // spaces, so it is custom lowered to handle those where it isn't.
  setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Custom);
  setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Custom);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Custom);
  setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom);
  setLoadExtAction(ISD::EXTLOAD, MVT::i16, Custom);

  setOperationAction(ISD::STORE, MVT::i8, Custom);
  setOperationAction(ISD::STORE, MVT::i32, Custom);
  setOperationAction(ISD::STORE, MVT::v2i32, Custom);
  setOperationAction(ISD::STORE, MVT::v4i32, Custom);
  setTruncStoreAction(MVT::i32, MVT::i8, Custom);
  setTruncStoreAction(MVT::i32, MVT::i16, Custom);

  setOperationAction(ISD::LOAD, MVT::i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
  setOperationAction(ISD::FrameIndex, MVT::i32, Custom);

  setTargetDAGCombine(ISD::FP_ROUND);
  setTargetDAGCombine(ISD::FP_TO_SINT);
  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
  setTargetDAGCombine(ISD::SELECT_CC);
  setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);

  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);

  setBooleanContents(ZeroOrNegativeOneBooleanContent);
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
  setSchedulingPreference(Sched::Source);
}

MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
    MachineInstr * MI, MachineBasicBlock * BB) const {
  MachineFunction * MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  MachineBasicBlock::iterator I = *MI;
  const R600InstrInfo *TII =
      static_cast<const R600InstrInfo*>(MF->getTarget().getInstrInfo());

  switch (MI->getOpcode()) {
  default:
    if (TII->isLDSInstr(MI->getOpcode()) &&
        TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst) != -1) {
      int DstIdx = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
      assert(DstIdx != -1);
      MachineInstrBuilder NewMI;
      if (!MRI.use_empty(MI->getOperand(DstIdx).getReg())) {
        NewMI = BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()),
                        AMDGPU::OQAP);
        TII->buildDefaultInstruction(*BB, I, AMDGPU::MOV,
                                     MI->getOperand(0).getReg(),
                                     AMDGPU::OQAP);
      } else {
        NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
                        TII->get(AMDGPU::getLDSNoRetOp(MI->getOpcode())));
      }
      for (unsigned i = 1, e = MI->getNumOperands(); i < e; ++i) {
        NewMI.addOperand(MI->getOperand(i));
      }
    } else {
      return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
    }
    break;
  case AMDGPU::CLAMP_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
                                                       AMDGPU::MOV,
                                                       MI->getOperand(0).getReg(),
                                                       MI->getOperand(1).getReg());
    TII->addFlag(NewMI, 0, MO_FLAG_CLAMP);
    break;
  }

  case AMDGPU::FABS_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
                                                       AMDGPU::MOV,
                                                       MI->getOperand(0).getReg(),
                                                       MI->getOperand(1).getReg());
    TII->addFlag(NewMI, 0, MO_FLAG_ABS);
    break;
  }

  case AMDGPU::FNEG_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
                                                       AMDGPU::MOV,
                                                       MI->getOperand(0).getReg(),
                                                       MI->getOperand(1).getReg());
    TII->addFlag(NewMI, 0, MO_FLAG_NEG);
    break;
  }

  case AMDGPU::MASK_WRITE: {
    unsigned maskedRegister = MI->getOperand(0).getReg();
    assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
    MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
    TII->addFlag(defInstr, 0, MO_FLAG_MASK);
    break;
  }

  case AMDGPU::MOV_IMM_F32:
    TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
                     MI->getOperand(1).getFPImm()->getValueAPF()
                         .bitcastToAPInt().getZExtValue());
    break;
  case AMDGPU::MOV_IMM_I32:
    TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
                     MI->getOperand(1).getImm());
    break;
  case AMDGPU::CONST_COPY: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV,
        MI->getOperand(0).getReg(), AMDGPU::ALU_CONST);
    TII->setImmOperand(NewMI, AMDGPU::OpName::src0_sel,
                       MI->getOperand(1).getImm());
    break;
  }

  case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
  case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
  case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
    unsigned EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;

    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
        .addOperand(MI->getOperand(0))
        .addOperand(MI->getOperand(1))
        .addImm(EOP); // Set End of program bit
    break;
  }

  case AMDGPU::TXD: {
    unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    MachineOperand &RID = MI->getOperand(4);
    MachineOperand &SID = MI->getOperand(5);
    unsigned TextureId = MI->getOperand(6).getImm();
    unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
    unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;

    switch (TextureId) {
    case 5: // Rect
      CTX = CTY = 0;
      break;
    case 6: // Shadow1D
      SrcW = SrcZ;
      break;
    case 7: // Shadow2D
      SrcW = SrcZ;
      break;
    case 8: // ShadowRect
      CTX = CTY = 0;
      SrcW = SrcZ;
      break;
    case 9: // 1DArray
      SrcZ = SrcY;
      CTZ = 0;
      break;
    case 10: // 2DArray
      CTZ = 0;
      break;
    case 11: // Shadow1DArray
      SrcZ = SrcY;
      CTZ = 0;
      break;
    case 12: // Shadow2DArray
      CTZ = 0;
      break;
    }
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
        .addOperand(MI->getOperand(3))
        .addImm(SrcX)
        .addImm(SrcY)
        .addImm(SrcZ)
        .addImm(SrcW)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(1)
        .addImm(2)
        .addImm(3)
        .addOperand(RID)
        .addOperand(SID)
        .addImm(CTX)
        .addImm(CTY)
        .addImm(CTZ)
        .addImm(CTW);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
        .addOperand(MI->getOperand(2))
        .addImm(SrcX)
        .addImm(SrcY)
        .addImm(SrcZ)
        .addImm(SrcW)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(1)
        .addImm(2)
        .addImm(3)
        .addOperand(RID)
        .addOperand(SID)
        .addImm(CTX)
        .addImm(CTY)
        .addImm(CTZ)
        .addImm(CTW);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
        .addOperand(MI->getOperand(0))
        .addOperand(MI->getOperand(1))
        .addImm(SrcX)
        .addImm(SrcY)
        .addImm(SrcZ)
        .addImm(SrcW)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(1)
        .addImm(2)
        .addImm(3)
        .addOperand(RID)
        .addOperand(SID)
        .addImm(CTX)
        .addImm(CTY)
        .addImm(CTZ)
        .addImm(CTW)
        .addReg(T0, RegState::Implicit)
        .addReg(T1, RegState::Implicit);
    break;
  }

  case AMDGPU::TXD_SHADOW: {
    unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    MachineOperand &RID = MI->getOperand(4);
    MachineOperand &SID = MI->getOperand(5);
    unsigned TextureId = MI->getOperand(6).getImm();
    unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
    unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;

    switch (TextureId) {
    case 5: // Rect
      CTX = CTY = 0;
      break;
    case 6: // Shadow1D
      SrcW = SrcZ;
      break;
    case 7: // Shadow2D
      SrcW = SrcZ;
      break;
    case 8: // ShadowRect
      CTX = CTY = 0;
      SrcW = SrcZ;
      break;
    case 9: // 1DArray
      SrcZ = SrcY;
      CTZ = 0;
      break;
    case 10: // 2DArray
      CTZ = 0;
      break;
    case 11: // Shadow1DArray
      SrcZ = SrcY;
      CTZ = 0;
      break;
    case 12: // Shadow2DArray
      CTZ = 0;
      break;
    }

    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
        .addOperand(MI->getOperand(3))
        .addImm(SrcX)
        .addImm(SrcY)
        .addImm(SrcZ)
        .addImm(SrcW)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(1)
        .addImm(2)
        .addImm(3)
        .addOperand(RID)
        .addOperand(SID)
        .addImm(CTX)
        .addImm(CTY)
        .addImm(CTZ)
        .addImm(CTW);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
        .addOperand(MI->getOperand(2))
        .addImm(SrcX)
        .addImm(SrcY)
        .addImm(SrcZ)
        .addImm(SrcW)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(1)
        .addImm(2)
        .addImm(3)
        .addOperand(RID)
        .addOperand(SID)
        .addImm(CTX)
        .addImm(CTY)
        .addImm(CTZ)
        .addImm(CTW);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
        .addOperand(MI->getOperand(0))
        .addOperand(MI->getOperand(1))
        .addImm(SrcX)
        .addImm(SrcY)
        .addImm(SrcZ)
        .addImm(SrcW)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(1)
        .addImm(2)
        .addImm(3)
        .addOperand(RID)
        .addOperand(SID)
        .addImm(CTX)
        .addImm(CTY)
        .addImm(CTZ)
        .addImm(CTW)
        .addReg(T0, RegState::Implicit)
        .addReg(T1, RegState::Implicit);
    break;
  }

  case AMDGPU::BRANCH:
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
        .addOperand(MI->getOperand(0));
    break;

  case AMDGPU::BRANCH_COND_f32: {
    MachineInstr *NewMI =
        BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
                AMDGPU::PREDICATE_BIT)
            .addOperand(MI->getOperand(1))
            .addImm(OPCODE_IS_NOT_ZERO)
            .addImm(0); // Flags
    TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
        .addOperand(MI->getOperand(0))
        .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
    break;
  }

  case AMDGPU::BRANCH_COND_i32: {
    MachineInstr *NewMI =
        BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
                AMDGPU::PREDICATE_BIT)
            .addOperand(MI->getOperand(1))
            .addImm(OPCODE_IS_NOT_ZERO_INT)
            .addImm(0); // Flags
    TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
        .addOperand(MI->getOperand(0))
        .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
    break;
  }

  case AMDGPU::EG_ExportSwz:
  case AMDGPU::R600_ExportSwz: {
    // Instruction is left unmodified if it's not the last one of its type.
    bool isLastInstructionOfItsType = true;
    unsigned InstExportType = MI->getOperand(1).getImm();
    for (MachineBasicBlock::iterator NextExportInst = llvm::next(I),
         EndBlock = BB->end(); NextExportInst != EndBlock;
         NextExportInst = llvm::next(NextExportInst)) {
      if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
          NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
        unsigned CurrentInstExportType = NextExportInst->getOperand(1)
            .getImm();
        if (CurrentInstExportType == InstExportType) {
          isLastInstructionOfItsType = false;
          break;
        }
      }
    }
    bool EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
    if (!EOP && !isLastInstructionOfItsType)
      return BB;
    unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz) ? 84 : 40;
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
        .addOperand(MI->getOperand(0))
        .addOperand(MI->getOperand(1))
        .addOperand(MI->getOperand(2))
        .addOperand(MI->getOperand(3))
        .addOperand(MI->getOperand(4))
        .addOperand(MI->getOperand(5))
        .addOperand(MI->getOperand(6))
        .addImm(CfInst)
        .addImm(EOP);
    break;
  }
  case AMDGPU::RETURN: {
    // RETURN instructions must have the live-out registers as implicit uses,
    // otherwise they appear dead.
    R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
    MachineInstrBuilder MIB(*MF, MI);
    for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i)
      MIB.addReg(MFI->LiveOuts[i], RegState::Implicit);
    return BB;
  }
  }

  MI->eraseFromParent();
  return BB;
}

//===----------------------------------------------------------------------===//
// Custom DAG Lowering Operations
//===----------------------------------------------------------------------===//

SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
  switch (Op.getOpcode()) {
  default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
  case ISD::FCOS:
  case ISD::FSIN: return LowerTrig(Op, DAG);
  case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
  case ISD::STORE: return LowerSTORE(Op, DAG);
  case ISD::LOAD: return LowerLOAD(Op, DAG);
  case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
  case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
  case ISD::INTRINSIC_VOID: {
    SDValue Chain = Op.getOperand(0);
    unsigned IntrinsicID =
        cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    switch (IntrinsicID) {
    case AMDGPUIntrinsic::AMDGPU_store_output: {
      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
      unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
      MFI->LiveOuts.push_back(Reg);
      return DAG.getCopyToReg(Chain, SDLoc(Op), Reg, Op.getOperand(2));
    }
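    // A hedged note: the identity swizzle (0,1,2,3 = X,Y,Z,W) emitted below
    // is only a starting point; the AMDGPUISD::EXPORT combine later runs
    // these arguments through OptimizeSwizzle, so constant or repeated lanes
    // can still be folded away after lowering.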
    case AMDGPUIntrinsic::R600_store_swizzle: {
      const SDValue Args[8] = {
        Chain,
        Op.getOperand(2), // Export Value
        Op.getOperand(3), // ArrayBase
        Op.getOperand(4), // Type
        DAG.getConstant(0, MVT::i32), // SWZ_X
        DAG.getConstant(1, MVT::i32), // SWZ_Y
        DAG.getConstant(2, MVT::i32), // SWZ_Z
        DAG.getConstant(3, MVT::i32)  // SWZ_W
      };
      return DAG.getNode(AMDGPUISD::EXPORT, SDLoc(Op), Op.getValueType(),
                         Args, 8);
    }

    // default for switch(IntrinsicID)
    default: break;
    }
    // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
    break;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IntrinsicID =
        cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
    EVT VT = Op.getValueType();
    SDLoc DL(Op);
    switch (IntrinsicID) {
    default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
    case AMDGPUIntrinsic::R600_load_input: {
      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
      unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
      MachineFunction &MF = DAG.getMachineFunction();
      MachineRegisterInfo &MRI = MF.getRegInfo();
      MRI.addLiveIn(Reg);
      return DAG.getCopyFromReg(DAG.getEntryNode(),
                                SDLoc(DAG.getEntryNode()), Reg, VT);
    }

    case AMDGPUIntrinsic::R600_interp_input: {
      int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
      int ijb = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue();
      MachineSDNode *interp;
      if (ijb < 0) {
        const MachineFunction &MF = DAG.getMachineFunction();
        const R600InstrInfo *TII =
            static_cast<const R600InstrInfo*>(MF.getTarget().getInstrInfo());
        interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL,
            MVT::v4f32, DAG.getTargetConstant(slot / 4, MVT::i32));
        return DAG.getTargetExtractSubreg(
            TII->getRegisterInfo().getSubRegFromChannel(slot % 4),
            DL, MVT::f32, SDValue(interp, 0));
      }
      MachineFunction &MF = DAG.getMachineFunction();
      MachineRegisterInfo &MRI = MF.getRegInfo();
      unsigned RegisterI = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb);
      unsigned RegisterJ = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1);
      MRI.addLiveIn(RegisterI);
      MRI.addLiveIn(RegisterJ);
      SDValue RegisterINode = DAG.getCopyFromReg(DAG.getEntryNode(),
          SDLoc(DAG.getEntryNode()), RegisterI, MVT::f32);
      SDValue RegisterJNode = DAG.getCopyFromReg(DAG.getEntryNode(),
          SDLoc(DAG.getEntryNode()), RegisterJ, MVT::f32);

      if (slot % 4 < 2)
        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
            MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4, MVT::i32),
            RegisterJNode, RegisterINode);
      else
        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
            MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4, MVT::i32),
            RegisterJNode, RegisterINode);
      return SDValue(interp, slot % 2);
    }
    case AMDGPUIntrinsic::R600_interp_xy:
    case AMDGPUIntrinsic::R600_interp_zw: {
      int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
      MachineSDNode *interp;
      SDValue RegisterINode = Op.getOperand(2);
      SDValue RegisterJNode = Op.getOperand(3);

      if (IntrinsicID == AMDGPUIntrinsic::R600_interp_xy)
        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
            MVT::f32, MVT::f32, DAG.getTargetConstant(slot, MVT::i32),
            RegisterJNode, RegisterINode);
      else
        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
            MVT::f32, MVT::f32, DAG.getTargetConstant(slot, MVT::i32),
            RegisterJNode, RegisterINode);
      return DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32,
                         SDValue(interp, 0), SDValue(interp, 1));
    }
    case AMDGPUIntrinsic::R600_tex:
    case AMDGPUIntrinsic::R600_texc:
    case AMDGPUIntrinsic::R600_txl:
    case AMDGPUIntrinsic::R600_txlc:
    case AMDGPUIntrinsic::R600_txb:
    case AMDGPUIntrinsic::R600_txbc:
    case AMDGPUIntrinsic::R600_txf:
    case AMDGPUIntrinsic::R600_txq:
    case AMDGPUIntrinsic::R600_ddx:
    case AMDGPUIntrinsic::R600_ddy:
    case AMDGPUIntrinsic::R600_ldptr: {
      unsigned TextureOp;
      switch (IntrinsicID) {
      case AMDGPUIntrinsic::R600_tex:
        TextureOp = 0;
        break;
      case AMDGPUIntrinsic::R600_texc:
        TextureOp = 1;
        break;
      case AMDGPUIntrinsic::R600_txl:
        TextureOp = 2;
        break;
      case AMDGPUIntrinsic::R600_txlc:
        TextureOp = 3;
        break;
      case AMDGPUIntrinsic::R600_txb:
        TextureOp = 4;
        break;
      case AMDGPUIntrinsic::R600_txbc:
        TextureOp = 5;
        break;
      case AMDGPUIntrinsic::R600_txf:
        TextureOp = 6;
        break;
      case AMDGPUIntrinsic::R600_txq:
        TextureOp = 7;
        break;
      case AMDGPUIntrinsic::R600_ddx:
        TextureOp = 8;
        break;
      case AMDGPUIntrinsic::R600_ddy:
        TextureOp = 9;
        break;
      case AMDGPUIntrinsic::R600_ldptr:
        TextureOp = 10;
        break;
      default:
        llvm_unreachable("Unknown Texture Operation");
      }

      SDValue TexArgs[19] = {
        DAG.getConstant(TextureOp, MVT::i32),
        Op.getOperand(1),
        DAG.getConstant(0, MVT::i32),
        DAG.getConstant(1, MVT::i32),
        DAG.getConstant(2, MVT::i32),
        DAG.getConstant(3, MVT::i32),
        Op.getOperand(2),
        Op.getOperand(3),
        Op.getOperand(4),
        DAG.getConstant(0, MVT::i32),
        DAG.getConstant(1, MVT::i32),
        DAG.getConstant(2, MVT::i32),
        DAG.getConstant(3, MVT::i32),
        Op.getOperand(5),
        Op.getOperand(6),
        Op.getOperand(7),
        Op.getOperand(8),
        Op.getOperand(9),
        Op.getOperand(10)
      };
      return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs, 19);
    }
    case AMDGPUIntrinsic::AMDGPU_dp4: {
      SDValue Args[8] = {
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
            DAG.getConstant(0, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
            DAG.getConstant(0, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
            DAG.getConstant(1, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
            DAG.getConstant(1, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
            DAG.getConstant(2, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
            DAG.getConstant(2, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
            DAG.getConstant(3, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
            DAG.getConstant(3, MVT::i32))
      };
      return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args, 8);
    }

    case Intrinsic::r600_read_ngroups_x:
      return LowerImplicitParameter(DAG, VT, DL, 0);
    case Intrinsic::r600_read_ngroups_y:
      return LowerImplicitParameter(DAG, VT, DL, 1);
    case Intrinsic::r600_read_ngroups_z:
      return LowerImplicitParameter(DAG, VT, DL, 2);
    case Intrinsic::r600_read_global_size_x:
      return LowerImplicitParameter(DAG, VT, DL, 3);
    case Intrinsic::r600_read_global_size_y:
      return LowerImplicitParameter(DAG, VT, DL, 4);
    case Intrinsic::r600_read_global_size_z:
      return LowerImplicitParameter(DAG, VT, DL, 5);
    case Intrinsic::r600_read_local_size_x:
      return LowerImplicitParameter(DAG, VT, DL, 6);
    case Intrinsic::r600_read_local_size_y:
      return LowerImplicitParameter(DAG, VT, DL, 7);
    case Intrinsic::r600_read_local_size_z:
      return LowerImplicitParameter(DAG, VT, DL, 8);

    case Intrinsic::r600_read_tgid_x:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T1_X, VT);
    case Intrinsic::r600_read_tgid_y:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T1_Y, VT);
    case Intrinsic::r600_read_tgid_z:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T1_Z, VT);
    case Intrinsic::r600_read_tidig_x:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T0_X, VT);
    case Intrinsic::r600_read_tidig_y:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T0_Y, VT);
    case Intrinsic::r600_read_tidig_z:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T0_Z, VT);
    }
    // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
    break;
  }
  } // end switch(Op.getOpcode())
  return SDValue();
}

void R600TargetLowering::ReplaceNodeResults(SDNode *N,
                                            SmallVectorImpl<SDValue> &Results,
                                            SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  default: return;
  case ISD::FP_TO_UINT:
    Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
    return;
  case ISD::LOAD: {
    SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode();
    Results.push_back(SDValue(Node, 0));
    Results.push_back(SDValue(Node, 1));
    // XXX: LLVM seems not to replace Chain Value inside CustomWidenLowerNode
    // function
    DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(Node, 1));
    return;
  }
  case ISD::STORE:
    SDNode *Node = LowerSTORE(SDValue(N, 0), DAG).getNode();
    Results.push_back(SDValue(Node, 0));
    return;
  }
}
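
// A rough worked example of the range reduction in LowerTrig below: the
// argument is scaled by 1/(2*pi) (~0.15915494), offset by 0.5, FRACT-ed and
// re-centered, so e.g. x = 2*pi yields FRACT(1.0 + 0.5) - 0.5 = 0.0 by the
// time it reaches SIN_HW/COS_HW, i.e. the angle is reduced to [-0.5, 0.5]
// turns of the unit circle.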
SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
  // On hw >= R700, COS/SIN input must be between -1. and 1.
  // Thus we lower them to TRIG ( FRACT ( x / 2Pi + 0.5) - 0.5)
  EVT VT = Op.getValueType();
  SDValue Arg = Op.getOperand(0);
  SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, SDLoc(Op), VT,
      DAG.getNode(ISD::FADD, SDLoc(Op), VT,
          DAG.getNode(ISD::FMUL, SDLoc(Op), VT, Arg,
              DAG.getConstantFP(0.15915494309, MVT::f32)),
          DAG.getConstantFP(0.5, MVT::f32)));
  unsigned TrigNode;
  switch (Op.getOpcode()) {
  case ISD::FCOS:
    TrigNode = AMDGPUISD::COS_HW;
    break;
  case ISD::FSIN:
    TrigNode = AMDGPUISD::SIN_HW;
    break;
  default:
    llvm_unreachable("Wrong trig opcode");
  }
  SDValue TrigVal = DAG.getNode(TrigNode, SDLoc(Op), VT,
      DAG.getNode(ISD::FADD, SDLoc(Op), VT, FractPart,
          DAG.getConstantFP(-0.5, MVT::f32)));
  if (Gen >= AMDGPUSubtarget::R700)
    return TrigVal;
  // On R600 hw, COS/SIN input must be between -Pi and Pi.
  return DAG.getNode(ISD::FMUL, SDLoc(Op), VT, TrigVal,
      DAG.getConstantFP(3.14159265359, MVT::f32));
}

SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
  return DAG.getNode(
      ISD::SETCC,
      SDLoc(Op),
      MVT::i1,
      Op, DAG.getConstantFP(0.0f, MVT::f32),
      DAG.getCondCode(ISD::SETNE));
}
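
// Orientation for the dword offsets used with this helper (taken from the
// intrinsic cases in LowerOperation above): 0-2 are ngroups x/y/z, 3-5 the
// global size x/y/z, 6-8 the local size x/y/z. For example, offset 6 reads
// local_size_x at byte 6 * 4 = 24 of CONSTANT_BUFFER_0.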
SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
                                                   SDLoc DL,
                                                   unsigned DwordOffset) const {
  unsigned ByteOffset = DwordOffset * 4;
  PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
                                           AMDGPUAS::CONSTANT_BUFFER_0);

  // We shouldn't be using an offset wider than 16-bits for implicit parameters.
  assert(isInt<16>(ByteOffset));

  return DAG.getLoad(VT, DL, DAG.getEntryNode(),
                     DAG.getConstant(ByteOffset, MVT::i32), // PTR
                     MachinePointerInfo(ConstantPointerNull::get(PtrType)),
                     false, false, false, 0);
}

SDValue R600TargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {

  MachineFunction &MF = DAG.getMachineFunction();
  const AMDGPUFrameLowering *TFL =
      static_cast<const AMDGPUFrameLowering*>(getTargetMachine().getFrameLowering());

  FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Op);
  assert(FIN);

  unsigned FrameIndex = FIN->getIndex();
  unsigned Offset = TFL->getFrameIndexOffset(MF, FrameIndex);
  return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), MVT::i32);
}

bool R600TargetLowering::isZero(SDValue Op) const {
  if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
    return Cst->isNullValue();
  } else if (ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)) {
    return CstFP->isZero();
  } else {
    return false;
  }
}

SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDValue True = Op.getOperand(2);
  SDValue False = Op.getOperand(3);
  SDValue CC = Op.getOperand(4);
  SDValue Temp;

  // LHS and RHS are guaranteed to be the same value type
  EVT CompareVT = LHS.getValueType();

  // Check if we can lower this to a native operation.

  // Try to lower to a SET* instruction:
  //
  // SET* can match the following patterns:
  //
  // select_cc f32, f32, -1, 0, cc_supported
  // select_cc f32, f32, 1.0f, 0.0f, cc_supported
  // select_cc i32, i32, -1, 0, cc_supported
  //

  // Move hardware True/False values to the correct operand.
  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
  ISD::CondCode InverseCC =
      ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
  if (isHWTrueValue(False) && isHWFalseValue(True)) {
    if (isCondCodeLegal(InverseCC, CompareVT.getSimpleVT())) {
      std::swap(False, True);
      CC = DAG.getCondCode(InverseCC);
    } else {
      ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InverseCC);
      if (isCondCodeLegal(SwapInvCC, CompareVT.getSimpleVT())) {
        std::swap(False, True);
        std::swap(LHS, RHS);
        CC = DAG.getCondCode(SwapInvCC);
      }
    }
  }

  if (isHWTrueValue(True) && isHWFalseValue(False) &&
      (CompareVT == VT || VT == MVT::i32)) {
    // This can be matched by a SET* instruction.
    return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
  }

  // Try to lower to a CND* instruction:
  //
  // CND* can match the following patterns:
  //
  // select_cc f32, 0.0, f32, f32, cc_supported
  // select_cc f32, 0.0, i32, i32, cc_supported
  // select_cc i32, 0, f32, f32, cc_supported
  // select_cc i32, 0, i32, i32, cc_supported
  //

  // Try to move the zero value to the RHS
  if (isZero(LHS)) {
    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
    // Try swapping the operands
    ISD::CondCode CCSwapped = ISD::getSetCCSwappedOperands(CCOpcode);
    if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
      std::swap(LHS, RHS);
      CC = DAG.getCondCode(CCSwapped);
    } else {
      // Try inverting the condition and then swapping the operands
      ISD::CondCode CCInv = ISD::getSetCCInverse(CCOpcode, CompareVT.isInteger());
      CCSwapped = ISD::getSetCCSwappedOperands(CCInv);
      if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
        std::swap(True, False);
        std::swap(LHS, RHS);
        CC = DAG.getCondCode(CCSwapped);
      }
    }
  }
  if (isZero(RHS)) {
    SDValue Cond = LHS;
    SDValue Zero = RHS;
    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
    if (CompareVT != VT) {
      // Bitcast True / False to the correct types. This will end up being
      // a nop, but it allows us to define only a single pattern in the
      // .TD files for each CND* instruction rather than having to have
      // one pattern for integer True/False and one for fp True/False
      True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
      False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
    }

    switch (CCOpcode) {
    case ISD::SETONE:
    case ISD::SETUNE:
    case ISD::SETNE:
      CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
      Temp = True;
      True = False;
      False = Temp;
      break;
    default:
      break;
    }
    SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
                                     Cond, Zero,
                                     True, False,
                                     DAG.getCondCode(CCOpcode));
    return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
  }

  // Possible Min/Max pattern
  SDValue MinMax = LowerMinMax(Op, DAG);
  if (MinMax.getNode()) {
    return MinMax;
  }

  // If we make it this far, it means we have no native instructions to handle
  // this SELECT_CC, so we must lower it.
  SDValue HWTrue, HWFalse;

  if (CompareVT == MVT::f32) {
    HWTrue = DAG.getConstantFP(1.0f, CompareVT);
    HWFalse = DAG.getConstantFP(0.0f, CompareVT);
  } else if (CompareVT == MVT::i32) {
    HWTrue = DAG.getConstant(-1, CompareVT);
    HWFalse = DAG.getConstant(0, CompareVT);
  } else {
    assert(!"Unhandled value type in LowerSELECT_CC");
  }

  // Lower this unsupported SELECT_CC into a combination of two supported
  // SELECT_CC operations.
  SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue,
                             HWFalse, CC);

  return DAG.getNode(ISD::SELECT_CC, DL, VT,
                     Cond, HWFalse,
                     True, False,
                     DAG.getCondCode(ISD::SETNE));
}

/// LLVM generates byte-addressed pointers. For indirect addressing, we need to
/// convert these pointers to a register index. Each register holds
/// 16 bytes, (4 x 32-bit sub-registers), but we need to take into account the
/// \p StackWidth, which tells us how many of the 4 sub-registers will be used
/// for indirect addressing.
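/// For illustration: with StackWidth 1 each slot is a single 4-byte
/// sub-register, so a byte pointer is shifted right by 2; StackWidth 2 uses
/// 8 bytes per slot (shift by 3) and StackWidth 4 a whole 16-byte register
/// (shift by 4), matching the SRLPad values chosen below.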
SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
                                               unsigned StackWidth,
                                               SelectionDAG &DAG) const {
  unsigned SRLPad;
  switch (StackWidth) {
  case 1:
    SRLPad = 2;
    break;
  case 2:
    SRLPad = 3;
    break;
  case 4:
    SRLPad = 4;
    break;
  default: llvm_unreachable("Invalid stack width");
  }

  return DAG.getNode(ISD::SRL, SDLoc(Ptr), Ptr.getValueType(), Ptr,
                     DAG.getConstant(SRLPad, MVT::i32));
}

void R600TargetLowering::getStackAddress(unsigned StackWidth,
                                         unsigned ElemIdx,
                                         unsigned &Channel,
                                         unsigned &PtrIncr) const {
  switch (StackWidth) {
  default:
  case 1:
    Channel = 0;
    if (ElemIdx > 0) {
      PtrIncr = 1;
    } else {
      PtrIncr = 0;
    }
    break;
  case 2:
    Channel = ElemIdx % 2;
    if (ElemIdx == 2) {
      PtrIncr = 1;
    } else {
      PtrIncr = 0;
    }
    break;
  case 4:
    Channel = ElemIdx;
    PtrIncr = 0;
    break;
  }
}
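
// Rough example of the mapping above: with StackWidth == 2, element indices
// 0..3 of a v4 value yield (Channel, PtrIncr) = (0,0), (1,0), (0,1), (1,0).
// The callers below apply PtrIncr cumulatively, so elements 0 and 1 share one
// register index and elements 2 and 3 the next one.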

SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
  SDValue Chain = Op.getOperand(0);
  SDValue Value = Op.getOperand(1);
  SDValue Ptr = Op.getOperand(2);

  SDValue Result = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
  if (Result.getNode()) {
    return Result;
  }

  if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS) {
    if (StoreNode->isTruncatingStore()) {
      EVT VT = Value.getValueType();
      assert(VT.bitsLE(MVT::i32));
      EVT MemVT = StoreNode->getMemoryVT();
      SDValue MaskConstant;
      if (MemVT == MVT::i8) {
        MaskConstant = DAG.getConstant(0xFF, MVT::i32);
      } else {
        assert(MemVT == MVT::i16);
        MaskConstant = DAG.getConstant(0xFFFF, MVT::i32);
      }
      SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, VT, Ptr,
                                      DAG.getConstant(2, MVT::i32));
      SDValue ByteIndex = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), Ptr,
                                      DAG.getConstant(0x00000003, VT));
      SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant);
      SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex,
                                  DAG.getConstant(3, VT));
      SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, Shift);
      SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, Shift);
      // XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32
      // vector instead.
      SDValue Src[4] = {
        ShiftedValue,
        DAG.getConstant(0, MVT::i32),
        DAG.getConstant(0, MVT::i32),
        Mask
      };
      SDValue Input = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Src, 4);
      SDValue Args[3] = { Chain, Input, DWordAddr };
      return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL,
                                     Op->getVTList(), Args, 3, MemVT,
                                     StoreNode->getMemOperand());
    } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR &&
               Value.getValueType().bitsGE(MVT::i32)) {
      // Convert pointer from byte address to dword address.
      Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
                        DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
                                    Ptr, DAG.getConstant(2, MVT::i32)));

      if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
        assert(!"Truncated and indexed stores not supported yet");
      } else {
        Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
      }
      return Chain;
    }
  }

  EVT ValueVT = Value.getValueType();

  if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
    return SDValue();
  }

  // Lowering for indirect addressing

  const MachineFunction &MF = DAG.getMachineFunction();
  const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
      getTargetMachine().getFrameLowering());
  unsigned StackWidth = TFL->getStackWidth(MF);

  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);

  if (ValueVT.isVector()) {
    unsigned NumElemVT = ValueVT.getVectorNumElements();
    EVT ElemVT = ValueVT.getVectorElementType();
    SDValue Stores[4];

    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
                                      "vector width in store");

    for (unsigned i = 0; i < NumElemVT; ++i) {
      unsigned Channel, PtrIncr;
      getStackAddress(StackWidth, i, Channel, PtrIncr);
      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
                        DAG.getConstant(PtrIncr, MVT::i32));
      SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
                                 Value, DAG.getConstant(i, MVT::i32));

      Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
                              Chain, Elem, Ptr,
                              DAG.getTargetConstant(Channel, MVT::i32));
    }
    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores, NumElemVT);
  } else {
    if (ValueVT == MVT::i8) {
      Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
    }
    Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr,
                        DAG.getTargetConstant(0, MVT::i32)); // Channel
  }

  return Chain;
}
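
// Worked instance of the block numbering below: CONSTANT_BUFFER_2 maps to
// const block 512 + 4096 * 2 = 8704. These are kc-bank-relative slot numbers
// rather than byte addresses; LowerLOAD scales them by 16 bytes (4 channels
// x 4 bytes) when forming CONST_ADDRESS pointers.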
// Returns 512 + (kc_bank << 12)
static int
ConstantAddressBlock(unsigned AddressSpace) {
  switch (AddressSpace) {
  case AMDGPUAS::CONSTANT_BUFFER_0:
    return 512;
  case AMDGPUAS::CONSTANT_BUFFER_1:
    return 512 + 4096;
  case AMDGPUAS::CONSTANT_BUFFER_2:
    return 512 + 4096 * 2;
  case AMDGPUAS::CONSTANT_BUFFER_3:
    return 512 + 4096 * 3;
  case AMDGPUAS::CONSTANT_BUFFER_4:
    return 512 + 4096 * 4;
  case AMDGPUAS::CONSTANT_BUFFER_5:
    return 512 + 4096 * 5;
  case AMDGPUAS::CONSTANT_BUFFER_6:
    return 512 + 4096 * 6;
  case AMDGPUAS::CONSTANT_BUFFER_7:
    return 512 + 4096 * 7;
  case AMDGPUAS::CONSTANT_BUFFER_8:
    return 512 + 4096 * 8;
  case AMDGPUAS::CONSTANT_BUFFER_9:
    return 512 + 4096 * 9;
  case AMDGPUAS::CONSTANT_BUFFER_10:
    return 512 + 4096 * 10;
  case AMDGPUAS::CONSTANT_BUFFER_11:
    return 512 + 4096 * 11;
  case AMDGPUAS::CONSTANT_BUFFER_12:
    return 512 + 4096 * 12;
  case AMDGPUAS::CONSTANT_BUFFER_13:
    return 512 + 4096 * 13;
  case AMDGPUAS::CONSTANT_BUFFER_14:
    return 512 + 4096 * 14;
  case AMDGPUAS::CONSTANT_BUFFER_15:
    return 512 + 4096 * 15;
  default:
    return -1;
  }
}

SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
{
  EVT VT = Op.getValueType();
  SDLoc DL(Op);
  LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
  SDValue Chain = Op.getOperand(0);
  SDValue Ptr = Op.getOperand(1);
  SDValue LoweredLoad;

  if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) {
    SDValue MergedValues[2] = {
      SplitVectorLoad(Op, DAG),
      Chain
    };
    return DAG.getMergeValues(MergedValues, 2, DL);
  }

  int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
  if (ConstantBlock > -1 &&
      ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) ||
       (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) {
    SDValue Result;
    if (isa<ConstantExpr>(LoadNode->getSrcValue()) ||
        isa<Constant>(LoadNode->getSrcValue()) ||
        isa<ConstantSDNode>(Ptr)) {
      SDValue Slots[4];
      for (unsigned i = 0; i < 4; i++) {
        // We want Const position encoded with the following formula :
        // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
        // const_index is Ptr computed by llvm using an alignment of 16.
        // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and
        // then div by 4 at the ISel step
        SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
            DAG.getConstant(4 * i + ConstantBlock * 16, MVT::i32));
        Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
      }
      EVT NewVT = MVT::v4i32;
      unsigned NumElements = 4;
      if (VT.isVector()) {
        NewVT = VT;
        NumElements = VT.getVectorNumElements();
      }
      Result = DAG.getNode(ISD::BUILD_VECTOR, DL, NewVT, Slots, NumElements);
    } else {
      // A non-constant ptr can't be folded; keep it as a v4f32 load
      Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
          DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, MVT::i32)),
          DAG.getConstant(LoadNode->getAddressSpace() -
                          AMDGPUAS::CONSTANT_BUFFER_0, MVT::i32));
    }

    if (!VT.isVector()) {
      Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
                           DAG.getConstant(0, MVT::i32));
    }

    SDValue MergedValues[2] = {
      Result,
      Chain
    };
    return DAG.getMergeValues(MergedValues, 2, DL);
  }

  // For most operations returning SDValue() will result in the node being
  // expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we
  // need to manually expand loads that may be legal in some address spaces and
  // illegal in others. SEXT loads from CONSTANT_BUFFER_0 are supported for
  // compute shaders, since the data is sign extended when it is uploaded to the
  // buffer. However SEXT loads from other address spaces are not supported, so
  // we need to expand them here.
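  // Sketch of the expansion: an i8 sextload becomes roughly
  //   (i32 sra (shl (extload_i8 ptr), 24), 24)
  // using the 32 - 8 = 24 shift amount computed below from VT and MemVT.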
  if (LoadNode->getExtensionType() == ISD::SEXTLOAD) {
    EVT MemVT = LoadNode->getMemoryVT();
    assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8));
    SDValue ShiftAmount =
        DAG.getConstant(VT.getSizeInBits() - MemVT.getSizeInBits(), MVT::i32);
    SDValue NewLoad = DAG.getExtLoad(ISD::EXTLOAD, DL, VT, Chain, Ptr,
                                     LoadNode->getPointerInfo(), MemVT,
                                     LoadNode->isVolatile(),
                                     LoadNode->isNonTemporal(),
                                     LoadNode->getAlignment());
    SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, NewLoad, ShiftAmount);
    SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Shl, ShiftAmount);

    SDValue MergedValues[2] = { Sra, Chain };
    return DAG.getMergeValues(MergedValues, 2, DL);
  }

  if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
    return SDValue();
  }

  // Lowering for indirect addressing
  const MachineFunction &MF = DAG.getMachineFunction();
  const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
      getTargetMachine().getFrameLowering());
  unsigned StackWidth = TFL->getStackWidth(MF);

  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);

  if (VT.isVector()) {
    unsigned NumElemVT = VT.getVectorNumElements();
    EVT ElemVT = VT.getVectorElementType();
    SDValue Loads[4];

    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
                                      "vector width in load");

    for (unsigned i = 0; i < NumElemVT; ++i) {
      unsigned Channel, PtrIncr;
      getStackAddress(StackWidth, i, Channel, PtrIncr);
      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
                        DAG.getConstant(PtrIncr, MVT::i32));
      Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
                             Chain, Ptr,
                             DAG.getTargetConstant(Channel, MVT::i32),
                             Op.getOperand(2));
    }
    for (unsigned i = NumElemVT; i < 4; ++i) {
      Loads[i] = DAG.getUNDEF(ElemVT);
    }
    EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4);
    LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads, 4);
  } else {
    LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
                              Chain, Ptr,
                              DAG.getTargetConstant(0, MVT::i32), // Channel
                              Op.getOperand(2));
  }

  SDValue Ops[2];
  Ops[0] = LoweredLoad;
  Ops[1] = Chain;

  return DAG.getMergeValues(Ops, 2, DL);
}

/// XXX Only kernel functions are supported, so we can assume for now that
/// every function is a kernel function, but in the future we should use
/// separate calling conventions for kernel and non-kernel functions.
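/// Note how this ties in with LowerImplicitParameter above: the first 36
/// bytes (nine dwords) of CONSTANT_BUFFER_0 carry the ngroups/global/local
/// sizes, which is why explicit kernel arguments are fetched starting at
/// byte offset 36.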
SDValue R600TargetLowering::LowerFormalArguments(
    SDValue Chain,
    CallingConv::ID CallConv,
    bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins,
    SDLoc DL, SelectionDAG &DAG,
    SmallVectorImpl<SDValue> &InVals) const {
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
                 getTargetMachine(), ArgLocs, *DAG.getContext());
  MachineFunction &MF = DAG.getMachineFunction();
  unsigned ShaderType = MF.getInfo<R600MachineFunctionInfo>()->ShaderType;

  SmallVector<ISD::InputArg, 8> LocalIns;

  getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins,
                          LocalIns);

  AnalyzeFormalArguments(CCInfo, LocalIns);

  for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
    CCValAssign &VA = ArgLocs[i];
    EVT VT = Ins[i].VT;
    EVT MemVT = LocalIns[i].VT;

    if (ShaderType != ShaderType::COMPUTE) {
      unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass);
      SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT);
      InVals.push_back(Register);
      continue;
    }

    PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
                                          AMDGPUAS::CONSTANT_BUFFER_0);

    // The first 36 bytes of the input buffer contain information about
    // thread group and global sizes.
    // 4 is the preferred alignment for the CONSTANT memory space.
    SDValue Arg = DAG.getExtLoad(ISD::SEXTLOAD, DL, VT, Chain,
                                 DAG.getConstant(36 + VA.getLocMemOffset(), MVT::i32),
                                 MachinePointerInfo(UndefValue::get(PtrTy)),
                                 MemVT, false, false, 4);
    InVals.push_back(Arg);
  }
  return Chain;
}

EVT R600TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
  if (!VT.isVector()) return MVT::i32;
  return VT.changeVectorElementTypeToInteger();
}
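
// For intuition: a BUILD_VECTOR such as (x, 0.0, 1.0, x) can be encoded as
// the swizzle (X, SEL_0, SEL_1, X) on R600, so the helpers below rewrite
// constant and repeated lanes into swizzle selects and drop their operands.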
static SDValue
CompactSwizzlableVector(SelectionDAG &DAG, SDValue VectorEntry,
                        DenseMap<unsigned, unsigned> &RemapSwizzle) {
  assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
  assert(RemapSwizzle.empty());
  SDValue NewBldVec[4] = {
    VectorEntry.getOperand(0),
    VectorEntry.getOperand(1),
    VectorEntry.getOperand(2),
    VectorEntry.getOperand(3)
  };

  for (unsigned i = 0; i < 4; i++) {
    if (NewBldVec[i].getOpcode() == ISD::UNDEF)
      // We mask write here to teach later passes that the ith element of this
      // vector is undef. Thus we can use it to reduce 128-bit register usage,
      // break false dependencies and additionally make assembly easier to read.
      RemapSwizzle[i] = 7; // SEL_MASK_WRITE
    if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(NewBldVec[i])) {
      if (C->isZero()) {
        RemapSwizzle[i] = 4; // SEL_0
        NewBldVec[i] = DAG.getUNDEF(MVT::f32);
      } else if (C->isExactlyValue(1.0)) {
        RemapSwizzle[i] = 5; // SEL_1
        NewBldVec[i] = DAG.getUNDEF(MVT::f32);
      }
    }

    if (NewBldVec[i].getOpcode() == ISD::UNDEF)
      continue;
    for (unsigned j = 0; j < i; j++) {
      if (NewBldVec[i] == NewBldVec[j]) {
        NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType());
        RemapSwizzle[i] = j;
        break;
      }
    }
  }

  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
                     VectorEntry.getValueType(), NewBldVec, 4);
}

static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
                                DenseMap<unsigned, unsigned> &RemapSwizzle) {
  assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
  assert(RemapSwizzle.empty());
  SDValue NewBldVec[4] = {
    VectorEntry.getOperand(0),
    VectorEntry.getOperand(1),
    VectorEntry.getOperand(2),
    VectorEntry.getOperand(3)
  };
  bool isUnmovable[4] = { false, false, false, false };
  for (unsigned i = 0; i < 4; i++)
    RemapSwizzle[i] = i;

  for (unsigned i = 0; i < 4; i++) {
    if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
      unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
          ->getZExtValue();
      if (i == Idx) {
        isUnmovable[Idx] = true;
        continue;
      }
      if (isUnmovable[Idx])
        continue;
      // Swap i and Idx
      std::swap(NewBldVec[Idx], NewBldVec[i]);
      std::swap(RemapSwizzle[i], RemapSwizzle[Idx]);
      break;
    }
  }

  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
                     VectorEntry.getValueType(), NewBldVec, 4);
}


SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector,
                                            SDValue Swz[4],
                                            SelectionDAG &DAG) const {
  assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR);
  // Old -> New swizzle values
  DenseMap<unsigned, unsigned> SwizzleRemap;

  BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap);
  for (unsigned i = 0; i < 4; i++) {
    unsigned Idx = dyn_cast<ConstantSDNode>(Swz[i])->getZExtValue();
    if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
      Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
  }

  SwizzleRemap.clear();
  BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap);
  for (unsigned i = 0; i < 4; i++) {
    unsigned Idx = dyn_cast<ConstantSDNode>(Swz[i])->getZExtValue();
    if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
      Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
  }

  return BuildVector;
}


//===----------------------------------------------------------------------===//
// Custom DAG Optimizations
//===----------------------------------------------------------------------===//

SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;

  switch (N->getOpcode()) {
  // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
  case ISD::FP_ROUND: {
    SDValue Arg = N->getOperand(0);
    if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
      return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), N->getValueType(0),
                         Arg.getOperand(0));
    }
    break;
  }

  // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0, cc))) ->
  // (i32 select_cc f32, f32, -1, 0, cc)
  //
  // Mesa's GLSL frontend generates the above pattern a lot and we can lower
  // this to one of the SET*_DX10 instructions.
  case ISD::FP_TO_SINT: {
    SDValue FNeg = N->getOperand(0);
    if (FNeg.getOpcode() != ISD::FNEG) {
      return SDValue();
    }
    SDValue SelectCC = FNeg.getOperand(0);
    if (SelectCC.getOpcode() != ISD::SELECT_CC ||
        SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
        SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
        !isHWTrueValue(SelectCC.getOperand(2)) ||
        !isHWFalseValue(SelectCC.getOperand(3))) {
      return SDValue();
    }

    return DAG.getNode(ISD::SELECT_CC, SDLoc(N), N->getValueType(0),
                       SelectCC.getOperand(0), // LHS
                       SelectCC.getOperand(1), // RHS
                       DAG.getConstant(-1, MVT::i32), // True
                       DAG.getConstant(0, MVT::i32), // False
                       SelectCC.getOperand(4)); // CC

    break;
  }

  // insert_vector_elt (build_vector elt0, ... , eltN), NewEltIdx, idx
  // => build_vector elt0, ... , NewEltIdx, ... , eltN
  case ISD::INSERT_VECTOR_ELT: {
    SDValue InVec = N->getOperand(0);
    SDValue InVal = N->getOperand(1);
    SDValue EltNo = N->getOperand(2);
    SDLoc dl(N);

    // If the inserted element is an UNDEF, just use the input vector.
    if (InVal.getOpcode() == ISD::UNDEF)
      return InVec;

    EVT VT = InVec.getValueType();

    // If we can't generate a legal BUILD_VECTOR, exit
    if (!isOperationLegal(ISD::BUILD_VECTOR, VT))
      return SDValue();

    // Check that we know which element is being inserted
    if (!isa<ConstantSDNode>(EltNo))
      return SDValue();
    unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();

    // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
    // be converted to a BUILD_VECTOR). Fill in the Ops vector with the
    // vector elements.
    SmallVector<SDValue, 8> Ops;
    if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
      Ops.append(InVec.getNode()->op_begin(),
                 InVec.getNode()->op_end());
    } else if (InVec.getOpcode() == ISD::UNDEF) {
      unsigned NElts = VT.getVectorNumElements();
      Ops.append(NElts, DAG.getUNDEF(InVal.getValueType()));
    } else {
      return SDValue();
    }

    // Insert the element
    if (Elt < Ops.size()) {
      // All the operands of BUILD_VECTOR must have the same type;
      // we enforce that here.
      EVT OpVT = Ops[0].getValueType();
      if (InVal.getValueType() != OpVT)
        InVal = OpVT.bitsGT(InVal.getValueType()) ?
            DAG.getNode(ISD::ANY_EXTEND, dl, OpVT, InVal) :
            DAG.getNode(ISD::TRUNCATE, dl, OpVT, InVal);
      Ops[Elt] = InVal;
    }

    // Return the new vector
    return DAG.getNode(ISD::BUILD_VECTOR, dl,
                       VT, &Ops[0], Ops.size());
  }

  // Extract_vec (Build_vector) generated by custom lowering
  // also needs to be custom combined
  case ISD::EXTRACT_VECTOR_ELT: {
    SDValue Arg = N->getOperand(0);
    if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
        unsigned Element = Const->getZExtValue();
        return Arg->getOperand(Element);
      }
    }
    if (Arg.getOpcode() == ISD::BITCAST &&
        Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
        unsigned Element = Const->getZExtValue();
        return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getVTList(),
                           Arg->getOperand(0).getOperand(Element));
      }
    }
  }

  case ISD::SELECT_CC: {
    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
    //      selectcc x, y, a, b, inv(cc)
    //
    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
    //      selectcc x, y, a, b, cc
    SDValue LHS = N->getOperand(0);
    if (LHS.getOpcode() != ISD::SELECT_CC) {
      return SDValue();
    }

    SDValue RHS = N->getOperand(1);
    SDValue True = N->getOperand(2);
    SDValue False = N->getOperand(3);
    ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();

    if (LHS.getOperand(2).getNode() != True.getNode() ||
        LHS.getOperand(3).getNode() != False.getNode() ||
        RHS.getNode() != False.getNode()) {
      return SDValue();
    }

    switch (NCC) {
    default: return SDValue();
    case ISD::SETNE: return LHS;
    case ISD::SETEQ: {
      ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
      LHSCC = ISD::getSetCCInverse(LHSCC,
                                   LHS.getOperand(0).getValueType().isInteger());
      if (DCI.isBeforeLegalizeOps() ||
          isCondCodeLegal(LHSCC, LHS.getOperand(0).getSimpleValueType()))
        return DAG.getSelectCC(SDLoc(N),
                               LHS.getOperand(0),
                               LHS.getOperand(1),
                               LHS.getOperand(2),
                               LHS.getOperand(3),
                               LHSCC);
      break;
    }
    }
    return SDValue();
  }

  case AMDGPUISD::EXPORT: {
    SDValue Arg = N->getOperand(1);
    if (Arg.getOpcode() != ISD::BUILD_VECTOR)
      break;

    SDValue NewArgs[8] = {
      N->getOperand(0), // Chain
      SDValue(),
      N->getOperand(2), // ArrayBase
      N->getOperand(3), // Type
      N->getOperand(4), // SWZ_X
      N->getOperand(5), // SWZ_Y
      N->getOperand(6), // SWZ_Z
      N->getOperand(7)  // SWZ_W
    };
    SDLoc DL(N);
    NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG);
    return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs, 8);
  }
  case AMDGPUISD::TEXTURE_FETCH: {
    SDValue Arg = N->getOperand(1);
    if (Arg.getOpcode() != ISD::BUILD_VECTOR)
      break;

    SDValue NewArgs[19] = {
      N->getOperand(0),
      N->getOperand(1),
      N->getOperand(2),
      N->getOperand(3),
      N->getOperand(4),
      N->getOperand(5),
      N->getOperand(6),
      N->getOperand(7),
      N->getOperand(8),
      N->getOperand(9),
      N->getOperand(10),
      N->getOperand(11),
      N->getOperand(12),
      N->getOperand(13),
      N->getOperand(14),
      N->getOperand(15),
      N->getOperand(16),
      N->getOperand(17),
      N->getOperand(18),
    };
    NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG);
    return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, SDLoc(N), N->getVTList(),
                       NewArgs, 19);
  }
  }
  return SDValue();
}
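
/// A brief orientation (informal): FoldOperand tries to absorb FNEG/FABS
/// wrappers, CONST_COPY sources and MOV immediates directly into an ALU
/// instruction's source operand fields. For example, a source fed by
/// FNEG_R600 becomes the plain source with its neg bit set, and the FP
/// constants 0.0/0.5/1.0 become the special ZERO/HALF/ONE registers.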
static bool
FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg,
            SDValue &Abs, SDValue &Sel, SDValue &Imm, SelectionDAG &DAG) {
  const R600InstrInfo *TII =
      static_cast<const R600InstrInfo *>(DAG.getTarget().getInstrInfo());
  if (!Src.isMachineOpcode())
    return false;
  switch (Src.getMachineOpcode()) {
  case AMDGPU::FNEG_R600:
    if (!Neg.getNode())
      return false;
    Src = Src.getOperand(0);
    Neg = DAG.getTargetConstant(1, MVT::i32);
    return true;
  case AMDGPU::FABS_R600:
    if (!Abs.getNode())
      return false;
    Src = Src.getOperand(0);
    Abs = DAG.getTargetConstant(1, MVT::i32);
    return true;
  case AMDGPU::CONST_COPY: {
    unsigned Opcode = ParentNode->getMachineOpcode();
    bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;

    if (!Sel.getNode())
      return false;

    SDValue CstOffset = Src.getOperand(0);
    if (ParentNode->getValueType(0).isVector())
      return false;

    // Gather the constant values already read by the other source operands.
    int SrcIndices[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src2),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
    };
    std::vector<unsigned> Consts;
    for (unsigned i = 0; i < sizeof(SrcIndices) / sizeof(int); i++) {
      int OtherSrcIdx = SrcIndices[i];
      int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx);
      if (OtherSrcIdx < 0 || OtherSelIdx < 0)
        continue;
      if (HasDst) {
        OtherSrcIdx--;
        OtherSelIdx--;
      }
      if (RegisterSDNode *Reg =
          dyn_cast<RegisterSDNode>(ParentNode->getOperand(OtherSrcIdx))) {
        if (Reg->getReg() == AMDGPU::ALU_CONST) {
          ConstantSDNode *Cst = cast<ConstantSDNode>(
              ParentNode->getOperand(OtherSelIdx));
          Consts.push_back(Cst->getZExtValue());
        }
      }
    }

    // Give up if adding this constant would exceed the constant read limits.
    ConstantSDNode *Cst = cast<ConstantSDNode>(CstOffset);
    Consts.push_back(Cst->getZExtValue());
    if (!TII->fitsConstReadLimitations(Consts)) {
      return false;
    }

    Sel = CstOffset;
    Src = DAG.getRegister(AMDGPU::ALU_CONST, MVT::f32);
    return true;
  }
  case AMDGPU::MOV_IMM_I32:
  case AMDGPU::MOV_IMM_F32: {
    unsigned ImmReg = AMDGPU::ALU_LITERAL_X;
    uint64_t ImmValue = 0;

    if (Src.getMachineOpcode() == AMDGPU::MOV_IMM_F32) {
      ConstantFPSDNode *FPC = cast<ConstantFPSDNode>(Src.getOperand(0));
      float FloatValue = FPC->getValueAPF().convertToFloat();
      if (FloatValue == 0.0) {
        ImmReg = AMDGPU::ZERO;
      } else if (FloatValue == 0.5) {
        ImmReg = AMDGPU::HALF;
      } else if (FloatValue == 1.0) {
        ImmReg = AMDGPU::ONE;
      } else {
        ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue();
      }
    } else {
      ConstantSDNode *C = cast<ConstantSDNode>(Src.getOperand(0));
      uint64_t Value = C->getZExtValue();
      if (Value == 0) {
        ImmReg = AMDGPU::ZERO;
      } else if (Value == 1) {
        ImmReg = AMDGPU::ONE_INT;
      } else {
        ImmValue = Value;
      }
    }

    // Check that we aren't already using an immediate.
    // XXX: It's possible for an instruction to have more than one
    // immediate operand, but this is not supported yet.
    if (ImmReg == AMDGPU::ALU_LITERAL_X) {
      if (!Imm.getNode())
        return false;
      ConstantSDNode *C = cast<ConstantSDNode>(Imm);
      if (C->getZExtValue())
        return false;
      Imm = DAG.getTargetConstant(ImmValue, MVT::i32);
    }
    Src = DAG.getRegister(ImmReg, MVT::i32);
    return true;
  }
  default:
    return false;
  }
}

/// \brief Fold the instructions after selecting them
SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
                                            SelectionDAG &DAG) const {
  const R600InstrInfo *TII =
      static_cast<const R600InstrInfo *>(DAG.getTarget().getInstrInfo());
  if (!Node->isMachineOpcode())
    return Node;
  unsigned Opcode = Node->getMachineOpcode();
  SDValue FakeOp;

  std::vector<SDValue> Ops;
  for (SDNode::op_iterator I = Node->op_begin(), E = Node->op_end();
       I != E; ++I)
    Ops.push_back(*I);

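  // Folding comes in four shapes below: DOT_4, whose two vector sources are
  // encoded as eight per-channel scalar operands; REG_SEQUENCE, whose value
  // operands sit at every other index; CLAMP_R600, which is folded by
  // setting the clamp modifier on the instruction that computes its source;
  // and ordinary ALU instructions with up to three sources. Whenever
  // FoldOperand() rewrites the copied operand list, the machine node is
  // rebuilt from it.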
  if (Opcode == AMDGPU::DOT_4) {
    int OperandIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
    };
    int NegIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_W),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_W)
    };
    int AbsIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_W),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_W)
    };
    for (unsigned i = 0; i < 8; i++) {
      if (OperandIdx[i] < 0)
        return Node;
      SDValue &Src = Ops[OperandIdx[i] - 1];
      SDValue &Neg = Ops[NegIdx[i] - 1];
      SDValue &Abs = Ops[AbsIdx[i] - 1];
      bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
      int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
      if (HasDst)
        SelIdx--;
      SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
      if (FoldOperand(Node, i, Src, Neg, Abs, Sel, FakeOp, DAG))
        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
    }
  } else if (Opcode == AMDGPU::REG_SEQUENCE) {
    for (unsigned i = 1, e = Node->getNumOperands(); i < e; i += 2) {
      SDValue &Src = Ops[i];
      if (FoldOperand(Node, i, Src, FakeOp, FakeOp, FakeOp, FakeOp, DAG))
        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
    }
  } else if (Opcode == AMDGPU::CLAMP_R600) {
    SDValue Src = Node->getOperand(0);
    if (!Src.isMachineOpcode() ||
        !TII->hasInstrModifiers(Src.getMachineOpcode()))
      return Node;
    int ClampIdx = TII->getOperandIdx(Src.getMachineOpcode(),
                                      AMDGPU::OpName::clamp);
    if (ClampIdx < 0)
      return Node;
    // Rebuild the source instruction with its clamp modifier set.
    std::vector<SDValue> SrcOps;
    unsigned NumOp = Src.getNumOperands();
    for (unsigned i = 0; i < NumOp; ++i)
      SrcOps.push_back(Src.getOperand(i));
    SrcOps[ClampIdx - 1] = DAG.getTargetConstant(1, MVT::i32);
    return DAG.getMachineNode(Src.getMachineOpcode(), SDLoc(Node),
                              Node->getVTList(), SrcOps);
  } else {
    if (!TII->hasInstrModifiers(Opcode))
      return Node;
    int OperandIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src2)
    };
    int NegIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src2_neg)
    };
    int AbsIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs),
      -1 // There is no src2_abs operand.
    };
    for (unsigned i = 0; i < 3; i++) {
      if (OperandIdx[i] < 0)
        return Node;
      SDValue &Src = Ops[OperandIdx[i] - 1];
      SDValue &Neg = Ops[NegIdx[i] - 1];
      SDValue FakeAbs;
      SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs;
      bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
      int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
      int ImmIdx = TII->getOperandIdx(Opcode, AMDGPU::OpName::literal);
      if (HasDst) {
        SelIdx--;
        ImmIdx--;
      }
      SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
      SDValue &Imm = Ops[ImmIdx];
      if (FoldOperand(Node, i, Src, Neg, Abs, Sel, Imm, DAG))
        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
    }
  }

  return Node;
}