R600ISelLowering.cpp revision 0962e147a439785279c3665379189017e980e0cc
//===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Custom DAG lowering for R600
//
//===----------------------------------------------------------------------===//

#include "R600ISelLowering.h"
#include "R600Defines.h"
#include "R600InstrInfo.h"
#include "R600MachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Function.h"

using namespace llvm;

R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
    AMDGPUTargetLowering(TM),
    TII(static_cast<const R600InstrInfo*>(TM.getInstrInfo())) {
  addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
  addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
  addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
  addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
  computeRegisterProperties();

  setOperationAction(ISD::FADD, MVT::v4f32, Expand);
  setOperationAction(ISD::FMUL, MVT::v4f32, Expand);
  setOperationAction(ISD::FDIV, MVT::v4f32, Expand);
  setOperationAction(ISD::FSUB, MVT::v4f32, Expand);

  setOperationAction(ISD::ADD, MVT::v4i32, Expand);
  setOperationAction(ISD::AND, MVT::v4i32, Expand);
  setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Expand);
  setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Expand);
  setOperationAction(ISD::MUL, MVT::v2i32, Expand);
  setOperationAction(ISD::MUL, MVT::v4i32, Expand);
  setOperationAction(ISD::OR, MVT::v4i32, Expand);
  setOperationAction(ISD::OR, MVT::v2i32, Expand);
  setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Expand);
  setOperationAction(ISD::SHL, MVT::v4i32, Expand);
  setOperationAction(ISD::SHL, MVT::v2i32, Expand);
  setOperationAction(ISD::SRL, MVT::v4i32, Expand);
  setOperationAction(ISD::SRL, MVT::v2i32, Expand);
  setOperationAction(ISD::SRA, MVT::v4i32, Expand);
  setOperationAction(ISD::SRA, MVT::v2i32, Expand);
  setOperationAction(ISD::SUB, MVT::v4i32, Expand);
  setOperationAction(ISD::SUB, MVT::v2i32, Expand);
  setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Expand);
  setOperationAction(ISD::UDIV, MVT::v4i32, Expand);
  setOperationAction(ISD::UREM, MVT::v4i32, Expand);
  setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
  setOperationAction(ISD::XOR, MVT::v4i32, Expand);
  setOperationAction(ISD::XOR, MVT::v2i32, Expand);

  setOperationAction(ISD::BR_CC, MVT::i32, Expand);
  setOperationAction(ISD::BR_CC, MVT::f32, Expand);

  setOperationAction(ISD::FSUB, MVT::f32, Expand);

  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);

  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);

  setOperationAction(ISD::SETCC, MVT::i32, Expand);
  setOperationAction(ISD::SETCC, MVT::f32, Expand);
  setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);

  setOperationAction(ISD::SELECT, MVT::i32, Custom);
  setOperationAction(ISD::SELECT, MVT::f32, Custom);

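  // SELECT is custom-lowered to a SELECT_CC against zero (see LowerSELECT
  // below); there is no native vector select, so VSELECT is expanded to
  // per-element operations.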
  setOperationAction(ISD::VSELECT, MVT::v4i32, Expand);
  setOperationAction(ISD::VSELECT, MVT::v2i32, Expand);

  // Legalize loads and stores to the private address space.
  setOperationAction(ISD::LOAD, MVT::i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4i8, Custom);
  setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i8, Custom);
  setOperationAction(ISD::STORE, MVT::i8, Custom);
  setOperationAction(ISD::STORE, MVT::i32, Custom);
  setOperationAction(ISD::STORE, MVT::v2i32, Custom);
  setOperationAction(ISD::STORE, MVT::v4i32, Custom);

  setOperationAction(ISD::LOAD, MVT::i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
  setOperationAction(ISD::FrameIndex, MVT::i32, Custom);

  setTargetDAGCombine(ISD::FP_ROUND);
  setTargetDAGCombine(ISD::FP_TO_SINT);
  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
  setTargetDAGCombine(ISD::SELECT_CC);

  setBooleanContents(ZeroOrNegativeOneBooleanContent);
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
  setSchedulingPreference(Sched::VLIW);
}

MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
    MachineInstr * MI, MachineBasicBlock * BB) const {
  MachineFunction * MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  MachineBasicBlock::iterator I = *MI;

  switch (MI->getOpcode()) {
  default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
  case AMDGPU::CLAMP_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
                                                       AMDGPU::MOV,
                                                       MI->getOperand(0).getReg(),
                                                       MI->getOperand(1).getReg());
    TII->addFlag(NewMI, 0, MO_FLAG_CLAMP);
    break;
  }

  case AMDGPU::FABS_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
                                                       AMDGPU::MOV,
                                                       MI->getOperand(0).getReg(),
                                                       MI->getOperand(1).getReg());
    TII->addFlag(NewMI, 0, MO_FLAG_ABS);
    break;
  }

  case AMDGPU::FNEG_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
                                                       AMDGPU::MOV,
                                                       MI->getOperand(0).getReg(),
                                                       MI->getOperand(1).getReg());
    TII->addFlag(NewMI, 0, MO_FLAG_NEG);
    break;
  }

  case AMDGPU::MASK_WRITE: {
    unsigned maskedRegister = MI->getOperand(0).getReg();
    assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
    MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
    TII->addFlag(defInstr, 0, MO_FLAG_MASK);
    break;
  }

  case AMDGPU::MOV_IMM_F32:
    TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
                     MI->getOperand(1).getFPImm()->getValueAPF()
                         .bitcastToAPInt().getZExtValue());
    break;
  case AMDGPU::MOV_IMM_I32:
    TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
                     MI->getOperand(1).getImm());
    break;
  case AMDGPU::CONST_COPY: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV,
        MI->getOperand(0).getReg(), AMDGPU::ALU_CONST);
    TII->setImmOperand(NewMI, R600Operands::SRC0_SEL,
                       MI->getOperand(1).getImm());
    break;
  }

  case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
  case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
    unsigned EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;

    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
        .addOperand(MI->getOperand(0))
        .addOperand(MI->getOperand(1))
        .addImm(EOP); // Set End of program bit
    break;
  }

  case AMDGPU::TXD: {
    unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    MachineOperand &RID = MI->getOperand(4);
    MachineOperand &SID = MI->getOperand(5);
    unsigned TextureId = MI->getOperand(6).getImm();
    unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
    unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;

    switch (TextureId) {
    case 5: // Rect
      CTX = CTY = 0;
      break;
    case 6: // Shadow1D
      SrcW = SrcZ;
      break;
    case 7: // Shadow2D
      SrcW = SrcZ;
      break;
    case 8: // ShadowRect
      CTX = CTY = 0;
      SrcW = SrcZ;
      break;
    case 9: // 1DArray
      SrcZ = SrcY;
      CTZ = 0;
      break;
    case 10: // 2DArray
      CTZ = 0;
      break;
    case 11: // Shadow1DArray
      SrcZ = SrcY;
      CTZ = 0;
      break;
    case 12: // Shadow2DArray
      CTZ = 0;
      break;
    }
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
        .addOperand(MI->getOperand(3))
        .addImm(SrcX)
        .addImm(SrcY)
        .addImm(SrcZ)
        .addImm(SrcW)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(1)
        .addImm(2)
        .addImm(3)
        .addOperand(RID)
        .addOperand(SID)
        .addImm(CTX)
        .addImm(CTY)
        .addImm(CTZ)
        .addImm(CTW);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
        .addOperand(MI->getOperand(2))
        .addImm(SrcX)
        .addImm(SrcY)
        .addImm(SrcZ)
        .addImm(SrcW)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(1)
        .addImm(2)
        .addImm(3)
        .addOperand(RID)
        .addOperand(SID)
        .addImm(CTX)
        .addImm(CTY)
        .addImm(CTZ)
        .addImm(CTW);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
        .addOperand(MI->getOperand(0))
        .addOperand(MI->getOperand(1))
        .addImm(SrcX)
        .addImm(SrcY)
        .addImm(SrcZ)
        .addImm(SrcW)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(1)
        .addImm(2)
        .addImm(3)
        .addOperand(RID)
        .addOperand(SID)
        .addImm(CTX)
        .addImm(CTY)
        .addImm(CTZ)
        .addImm(CTW)
        .addReg(T0, RegState::Implicit)
        .addReg(T1, RegState::Implicit);
    break;
  }

  case AMDGPU::TXD_SHADOW: {
    unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    MachineOperand &RID = MI->getOperand(4);
    MachineOperand &SID = MI->getOperand(5);
    unsigned TextureId = MI->getOperand(6).getImm();
    unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
    unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;

    switch (TextureId) {
    case 5: // Rect
      CTX = CTY = 0;
      break;
    case 6: // Shadow1D
      SrcW = SrcZ;
      break;
    case 7: // Shadow2D
      SrcW = SrcZ;
      break;
    case 8: // ShadowRect
      CTX = CTY = 0;
      SrcW = SrcZ;
      break;
    case 9: // 1DArray
      SrcZ = SrcY;
      CTZ = 0;
      break;
    case 10: // 2DArray
      CTZ = 0;
      break;
    case 11: // Shadow1DArray
      SrcZ = SrcY;
      CTZ = 0;
      break;
    case 12: // Shadow2DArray
      CTZ = 0;
      break;
    }

    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
        .addOperand(MI->getOperand(3))
        .addImm(SrcX)
        .addImm(SrcY)
        .addImm(SrcZ)
        .addImm(SrcW)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(1)
        .addImm(2)
        .addImm(3)
        .addOperand(RID)
        .addOperand(SID)
        .addImm(CTX)
        .addImm(CTY)
        .addImm(CTZ)
        .addImm(CTW);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
        .addOperand(MI->getOperand(2))
        .addImm(SrcX)
        .addImm(SrcY)
        .addImm(SrcZ)
        .addImm(SrcW)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(1)
        .addImm(2)
        .addImm(3)
        .addOperand(RID)
        .addOperand(SID)
        .addImm(CTX)
        .addImm(CTY)
        .addImm(CTZ)
        .addImm(CTW);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
        .addOperand(MI->getOperand(0))
        .addOperand(MI->getOperand(1))
        .addImm(SrcX)
        .addImm(SrcY)
        .addImm(SrcZ)
        .addImm(SrcW)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(1)
        .addImm(2)
        .addImm(3)
        .addOperand(RID)
        .addOperand(SID)
        .addImm(CTX)
        .addImm(CTY)
        .addImm(CTZ)
        .addImm(CTW)
        .addReg(T0, RegState::Implicit)
        .addReg(T1, RegState::Implicit);
    break;
  }

  case AMDGPU::BRANCH:
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
        .addOperand(MI->getOperand(0));
    break;

  case AMDGPU::BRANCH_COND_f32: {
    MachineInstr *NewMI =
        BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
                AMDGPU::PREDICATE_BIT)
            .addOperand(MI->getOperand(1))
            .addImm(OPCODE_IS_NOT_ZERO)
            .addImm(0); // Flags
    TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
        .addOperand(MI->getOperand(0))
        .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
    break;
  }

  case AMDGPU::BRANCH_COND_i32: {
    MachineInstr *NewMI =
        BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
                AMDGPU::PREDICATE_BIT)
            .addOperand(MI->getOperand(1))
            .addImm(OPCODE_IS_NOT_ZERO_INT)
            .addImm(0); // Flags
    TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
        .addOperand(MI->getOperand(0))
        .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
    break;
  }

  case AMDGPU::EG_ExportSwz:
  case AMDGPU::R600_ExportSwz: {
    // Instruction is left unmodified if it's not the last one of its type.
    bool isLastInstructionOfItsType = true;
    unsigned InstExportType = MI->getOperand(1).getImm();
    for (MachineBasicBlock::iterator NextExportInst = llvm::next(I),
         EndBlock = BB->end(); NextExportInst != EndBlock;
         NextExportInst = llvm::next(NextExportInst)) {
      if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
          NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
        unsigned CurrentInstExportType = NextExportInst->getOperand(1)
            .getImm();
        if (CurrentInstExportType == InstExportType) {
          isLastInstructionOfItsType = false;
          break;
        }
      }
    }
    bool EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN)? 1 : 0;
    if (!EOP && !isLastInstructionOfItsType)
      return BB;
    unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz)? 84 : 40;
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
        .addOperand(MI->getOperand(0))
        .addOperand(MI->getOperand(1))
        .addOperand(MI->getOperand(2))
        .addOperand(MI->getOperand(3))
        .addOperand(MI->getOperand(4))
        .addOperand(MI->getOperand(5))
        .addOperand(MI->getOperand(6))
        .addImm(CfInst)
        .addImm(EOP);
    break;
  }
  case AMDGPU::RETURN: {
    // RETURN instructions must have the live-out registers as implicit uses,
    // otherwise they appear dead.
    R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
    MachineInstrBuilder MIB(*MF, MI);
    for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i)
      MIB.addReg(MFI->LiveOuts[i], RegState::Implicit);
    return BB;
  }
  }

  MI->eraseFromParent();
  return BB;
}

//===----------------------------------------------------------------------===//
// Custom DAG Lowering Operations
//===----------------------------------------------------------------------===//

SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
  case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
  case ISD::SELECT: return LowerSELECT(Op, DAG);
  case ISD::STORE: return LowerSTORE(Op, DAG);
  case ISD::LOAD: return LowerLOAD(Op, DAG);
  case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
  case ISD::INTRINSIC_VOID: {
    SDValue Chain = Op.getOperand(0);
    unsigned IntrinsicID =
        cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    switch (IntrinsicID) {
    case AMDGPUIntrinsic::AMDGPU_store_output: {
      MachineFunction &MF = DAG.getMachineFunction();
      R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
      unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
      MFI->LiveOuts.push_back(Reg);
      return DAG.getCopyToReg(Chain, SDLoc(Op), Reg, Op.getOperand(2));
    }
    case AMDGPUIntrinsic::R600_store_swizzle: {
      const SDValue Args[8] = {
        Chain,
        Op.getOperand(2), // Export Value
        Op.getOperand(3), // ArrayBase
        Op.getOperand(4), // Type
        DAG.getConstant(0, MVT::i32), // SWZ_X
        DAG.getConstant(1, MVT::i32), // SWZ_Y
        DAG.getConstant(2, MVT::i32), // SWZ_Z
        DAG.getConstant(3, MVT::i32)  // SWZ_W
      };
      return DAG.getNode(AMDGPUISD::EXPORT, SDLoc(Op), Op.getValueType(),
                         Args, 8);
    }

    // default for switch(IntrinsicID)
    default: break;
    }
    // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
    break;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IntrinsicID =
        cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
    EVT VT = Op.getValueType();
    SDLoc DL(Op);
    switch(IntrinsicID) {
    default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
    case AMDGPUIntrinsic::R600_load_input: {
      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
      unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
      MachineFunction &MF = DAG.getMachineFunction();
      MachineRegisterInfo &MRI = MF.getRegInfo();
      MRI.addLiveIn(Reg);
      return DAG.getCopyFromReg(DAG.getEntryNode(),
                                SDLoc(DAG.getEntryNode()), Reg, VT);
    }

    case AMDGPUIntrinsic::R600_interp_input: {
      int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
      int ijb = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue();
      MachineSDNode *interp;
      if (ijb < 0) {
        interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL,
            MVT::v4f32, DAG.getTargetConstant(slot / 4 , MVT::i32));
        return DAG.getTargetExtractSubreg(
            TII->getRegisterInfo().getSubRegFromChannel(slot % 4),
            DL, MVT::f32, SDValue(interp, 0));
      }

      MachineFunction &MF = DAG.getMachineFunction();
      MachineRegisterInfo &MRI = MF.getRegInfo();
      unsigned RegisterI = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb);
      unsigned RegisterJ = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1);
      MRI.addLiveIn(RegisterI);
      MRI.addLiveIn(RegisterJ);
      SDValue RegisterINode = DAG.getCopyFromReg(DAG.getEntryNode(),
          SDLoc(DAG.getEntryNode()), RegisterI, MVT::f32);
      SDValue RegisterJNode = DAG.getCopyFromReg(DAG.getEntryNode(),
          SDLoc(DAG.getEntryNode()), RegisterJ, MVT::f32);

      if (slot % 4 < 2)
        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
            MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
            RegisterJNode, RegisterINode);
      else
        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
            MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
            RegisterJNode, RegisterINode);
      return SDValue(interp, slot % 2);
    }
    case AMDGPUIntrinsic::R600_tex:
    case AMDGPUIntrinsic::R600_texc:
    case AMDGPUIntrinsic::R600_txl:
    case AMDGPUIntrinsic::R600_txlc:
    case AMDGPUIntrinsic::R600_txb:
    case AMDGPUIntrinsic::R600_txbc:
    case AMDGPUIntrinsic::R600_txf:
    case AMDGPUIntrinsic::R600_txq:
    case AMDGPUIntrinsic::R600_ddx:
    case AMDGPUIntrinsic::R600_ddy: {
      unsigned TextureOp;
      switch (IntrinsicID) {
      case AMDGPUIntrinsic::R600_tex:
        TextureOp = 0;
        break;
      case AMDGPUIntrinsic::R600_texc:
        TextureOp = 1;
        break;
      case AMDGPUIntrinsic::R600_txl:
        TextureOp = 2;
        break;
      case AMDGPUIntrinsic::R600_txlc:
        TextureOp = 3;
        break;
      case AMDGPUIntrinsic::R600_txb:
        TextureOp = 4;
        break;
      case AMDGPUIntrinsic::R600_txbc:
        TextureOp = 5;
        break;
      case AMDGPUIntrinsic::R600_txf:
        TextureOp = 6;
        break;
      case AMDGPUIntrinsic::R600_txq:
        TextureOp = 7;
        break;
      case AMDGPUIntrinsic::R600_ddx:
        TextureOp = 8;
        break;
      case AMDGPUIntrinsic::R600_ddy:
        TextureOp = 9;
        break;
      default:
        llvm_unreachable("Unknown texture operation");
      }

      SDValue TexArgs[19] = {
        DAG.getConstant(TextureOp, MVT::i32),
        Op.getOperand(1),
        DAG.getConstant(0, MVT::i32),
        DAG.getConstant(1, MVT::i32),
        DAG.getConstant(2, MVT::i32),
        DAG.getConstant(3, MVT::i32),
        Op.getOperand(2),
        Op.getOperand(3),
        Op.getOperand(4),
        DAG.getConstant(0, MVT::i32),
        DAG.getConstant(1, MVT::i32),
        DAG.getConstant(2, MVT::i32),
        DAG.getConstant(3, MVT::i32),
        Op.getOperand(5),
        Op.getOperand(6),
        Op.getOperand(7),
        Op.getOperand(8),
        Op.getOperand(9),
        Op.getOperand(10)
      };
      return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs, 19);
    }
    case AMDGPUIntrinsic::AMDGPU_dp4: {
      SDValue Args[8] = {
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
            DAG.getConstant(0, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
            DAG.getConstant(0, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
            DAG.getConstant(1, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
            DAG.getConstant(1, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
            DAG.getConstant(2, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
            DAG.getConstant(2, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
            DAG.getConstant(3, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
            DAG.getConstant(3, MVT::i32))
      };
      return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args, 8);
    }

    case Intrinsic::r600_read_ngroups_x:
      return LowerImplicitParameter(DAG, VT, DL, 0);
    case Intrinsic::r600_read_ngroups_y:
      return LowerImplicitParameter(DAG, VT, DL, 1);
    case Intrinsic::r600_read_ngroups_z:
      return LowerImplicitParameter(DAG, VT, DL, 2);
    case Intrinsic::r600_read_global_size_x:
      return LowerImplicitParameter(DAG, VT, DL, 3);
    case Intrinsic::r600_read_global_size_y:
      return LowerImplicitParameter(DAG, VT, DL, 4);
    case Intrinsic::r600_read_global_size_z:
      return LowerImplicitParameter(DAG, VT, DL, 5);
    case Intrinsic::r600_read_local_size_x:
      return LowerImplicitParameter(DAG, VT, DL, 6);
    case Intrinsic::r600_read_local_size_y:
      return LowerImplicitParameter(DAG, VT, DL, 7);
    case Intrinsic::r600_read_local_size_z:
      return LowerImplicitParameter(DAG, VT, DL, 8);

    case Intrinsic::r600_read_tgid_x:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T1_X, VT);
    case Intrinsic::r600_read_tgid_y:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T1_Y, VT);
    case Intrinsic::r600_read_tgid_z:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T1_Z, VT);
    case Intrinsic::r600_read_tidig_x:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T0_X, VT);
    case Intrinsic::r600_read_tidig_y:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T0_Y, VT);
    case Intrinsic::r600_read_tidig_z:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T0_Z, VT);
    }
    // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
    break;
  }
  } // end switch(Op.getOpcode())
  return SDValue();
}

void R600TargetLowering::ReplaceNodeResults(SDNode *N,
                                            SmallVectorImpl<SDValue> &Results,
                                            SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  default: return;
  case ISD::FP_TO_UINT: Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
    return;
  case ISD::LOAD: {
    SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode();
    Results.push_back(SDValue(Node, 0));
    Results.push_back(SDValue(Node, 1));
    // XXX: LLVM seems not to replace Chain Value inside CustomWidenLowerNode
    // function
    DAG.ReplaceAllUsesOfValueWith(SDValue(N,1), SDValue(Node, 1));
    return;
  }
  case ISD::STORE:
    SDNode *Node = LowerSTORE(SDValue(N, 0), DAG).getNode();
    Results.push_back(SDValue(Node, 0));
    return;
  }
}

SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
  return DAG.getNode(
      ISD::SETCC,
      SDLoc(Op),
      MVT::i1,
      Op, DAG.getConstantFP(0.0f, MVT::f32),
      DAG.getCondCode(ISD::SETNE)
      );
}

SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
                                                   SDLoc DL,
                                                   unsigned DwordOffset) const {
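  // Implicit-parameter layout assumed by the intrinsic lowering above
  // (dword offsets, as read from the switch in LowerOperation):
  //   0-2 ngroups.{x,y,z}, 3-5 global_size.{x,y,z}, 6-8 local_size.{x,y,z}.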
  unsigned ByteOffset = DwordOffset * 4;
  PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
                                           AMDGPUAS::PARAM_I_ADDRESS);

  // We shouldn't be using an offset wider than 16-bits for implicit parameters.
  assert(isInt<16>(ByteOffset));

  return DAG.getLoad(VT, DL, DAG.getEntryNode(),
                     DAG.getConstant(ByteOffset, MVT::i32), // PTR
                     MachinePointerInfo(ConstantPointerNull::get(PtrType)),
                     false, false, false, 0);
}

SDValue R600TargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {

  MachineFunction &MF = DAG.getMachineFunction();
  const AMDGPUFrameLowering *TFL =
      static_cast<const AMDGPUFrameLowering*>(getTargetMachine().getFrameLowering());

  FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Op);
  assert(FIN);

  unsigned FrameIndex = FIN->getIndex();
  unsigned Offset = TFL->getFrameIndexOffset(MF, FrameIndex);
  return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), MVT::i32);
}

bool R600TargetLowering::isZero(SDValue Op) const {
  if(ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
    return Cst->isNullValue();
  } else if(ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)){
    return CstFP->isZero();
  } else {
    return false;
  }
}

SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDValue True = Op.getOperand(2);
  SDValue False = Op.getOperand(3);
  SDValue CC = Op.getOperand(4);
  SDValue Temp;

  // LHS and RHS are guaranteed to be the same value type.
  EVT CompareVT = LHS.getValueType();

  // Check if we can lower this to a native operation.

  // Try to lower to a SET* instruction:
  //
  // SET* can match the following patterns:
  //
  // select_cc f32, f32, -1, 0, cc_any
  // select_cc f32, f32, 1.0f, 0.0f, cc_any
  // select_cc i32, i32, -1, 0, cc_any
  //

  // Move hardware True/False values to the correct operand.
  if (isHWTrueValue(False) && isHWFalseValue(True)) {
    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
    std::swap(False, True);
    CC = DAG.getCondCode(ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32));
  }

  if (isHWTrueValue(True) && isHWFalseValue(False) &&
      (CompareVT == VT || VT == MVT::i32)) {
    // This can be matched by a SET* instruction.
    return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
  }

  // Try to lower to a CND* instruction:
  //
  // CND* can match the following patterns:
  //
  // select_cc f32, 0.0, f32, f32, cc_any
  // select_cc f32, 0.0, i32, i32, cc_any
  // select_cc i32, 0, f32, f32, cc_any
  // select_cc i32, 0, i32, i32, cc_any
  //
  if (isZero(LHS) || isZero(RHS)) {
    SDValue Cond = (isZero(LHS) ? RHS : LHS);
    SDValue Zero = (isZero(LHS) ? LHS : RHS);
    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
    if (CompareVT != VT) {
      // Bitcast True / False to the correct types. This will end up being
      // a nop, but it allows us to define only a single pattern in the
      // .TD files for each CND* instruction rather than having to have
      // one pattern for integer True/False and one for fp True/False.
      True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
      False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
    }
    if (isZero(LHS)) {
      CCOpcode = ISD::getSetCCSwappedOperands(CCOpcode);
    }

    switch (CCOpcode) {
    case ISD::SETONE:
    case ISD::SETUNE:
    case ISD::SETNE:
    case ISD::SETULE:
    case ISD::SETULT:
    case ISD::SETOLE:
    case ISD::SETOLT:
    case ISD::SETLE:
    case ISD::SETLT:
      CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
      Temp = True;
      True = False;
      False = Temp;
      break;
    default:
      break;
    }
    SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
        Cond, Zero,
        True, False,
        DAG.getCondCode(CCOpcode));
    return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
  }


  // Possible Min/Max pattern
  SDValue MinMax = LowerMinMax(Op, DAG);
  if (MinMax.getNode()) {
    return MinMax;
  }

  // If we make it this far it means we have no native instructions to handle
  // this SELECT_CC, so we must lower it.
  SDValue HWTrue, HWFalse;

  if (CompareVT == MVT::f32) {
    HWTrue = DAG.getConstantFP(1.0f, CompareVT);
    HWFalse = DAG.getConstantFP(0.0f, CompareVT);
  } else if (CompareVT == MVT::i32) {
    HWTrue = DAG.getConstant(-1, CompareVT);
    HWFalse = DAG.getConstant(0, CompareVT);
  }
  else {
    assert(!"Unhandled value type in LowerSELECT_CC");
  }

  // Lower this unsupported SELECT_CC into a combination of two supported
  // SELECT_CC operations.
  SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC);

  return DAG.getNode(ISD::SELECT_CC, DL, VT,
                     Cond, HWFalse,
                     True, False,
                     DAG.getCondCode(ISD::SETNE));
}

SDValue R600TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
  return DAG.getNode(ISD::SELECT_CC,
                     SDLoc(Op),
                     Op.getValueType(),
                     Op.getOperand(0),
                     DAG.getConstant(0, MVT::i32),
                     Op.getOperand(1),
                     Op.getOperand(2),
                     DAG.getCondCode(ISD::SETNE));
}

/// LLVM generates byte-addressed pointers. For indirect addressing, we need to
/// convert these pointers to a register index. Each register holds
/// 16 bytes, (4 x 32bit sub-register), but we need to take into account the
/// \p StackWidth, which tells us how many of the 4 sub-registers will be used
/// for indirect addressing.
SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
                                               unsigned StackWidth,
                                               SelectionDAG &DAG) const {
  unsigned SRLPad;
  switch(StackWidth) {
  case 1:
    SRLPad = 2;
    break;
  case 2:
    SRLPad = 3;
    break;
  case 4:
    SRLPad = 4;
    break;
  default: llvm_unreachable("Invalid stack width");
  }

  return DAG.getNode(ISD::SRL, SDLoc(Ptr), Ptr.getValueType(), Ptr,
                     DAG.getConstant(SRLPad, MVT::i32));
}
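// Worked example of the shift amounts above (our reading, not a separate
// spec): with StackWidth == 1 each item occupies one 4-byte channel, so byte
// address 8 becomes register index 2 (8 >> 2); with StackWidth == 2 an item
// spans two channels (8 bytes), so byte address 16 becomes index 2 (16 >> 3);
// with StackWidth == 4 a full 16-byte register is used per item, so byte
// address 32 becomes index 2 (32 >> 4).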
void R600TargetLowering::getStackAddress(unsigned StackWidth,
                                         unsigned ElemIdx,
                                         unsigned &Channel,
                                         unsigned &PtrIncr) const {
  switch (StackWidth) {
  default:
  case 1:
    Channel = 0;
    if (ElemIdx > 0) {
      PtrIncr = 1;
    } else {
      PtrIncr = 0;
    }
    break;
  case 2:
    Channel = ElemIdx % 2;
    if (ElemIdx == 2) {
      PtrIncr = 1;
    } else {
      PtrIncr = 0;
    }
    break;
  case 4:
    Channel = ElemIdx;
    PtrIncr = 0;
    break;
  }
}

SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
  SDValue Chain = Op.getOperand(0);
  SDValue Value = Op.getOperand(1);
  SDValue Ptr = Op.getOperand(2);

  if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
      Ptr->getOpcode() != AMDGPUISD::DWORDADDR) {
    // Convert pointer from byte address to dword address.
    Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
                      DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
                                  Ptr, DAG.getConstant(2, MVT::i32)));

    if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
      assert(!"Truncated and indexed stores not supported yet");
    } else {
      Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
    }
    return Chain;
  }

  EVT ValueVT = Value.getValueType();

  if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
    return SDValue();
  }

  // Lowering for indirect addressing

  const MachineFunction &MF = DAG.getMachineFunction();
  const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
      getTargetMachine().getFrameLowering());
  unsigned StackWidth = TFL->getStackWidth(MF);

  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);

  if (ValueVT.isVector()) {
    unsigned NumElemVT = ValueVT.getVectorNumElements();
    EVT ElemVT = ValueVT.getVectorElementType();
    SDValue Stores[4];

    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
                                      "vector width in store");

    for (unsigned i = 0; i < NumElemVT; ++i) {
      unsigned Channel, PtrIncr;
      getStackAddress(StackWidth, i, Channel, PtrIncr);
      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
                        DAG.getConstant(PtrIncr, MVT::i32));
      SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
                                 Value, DAG.getConstant(i, MVT::i32));

      Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
                              Chain, Elem, Ptr,
                              DAG.getTargetConstant(Channel, MVT::i32));
    }
    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores, NumElemVT);
  } else {
    if (ValueVT == MVT::i8) {
      Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
    }
    Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr,
        DAG.getTargetConstant(0, MVT::i32)); // Channel
  }

  return Chain;
}

// return 512 + (kc_bank << 12)
static int
ConstantAddressBlock(unsigned AddressSpace) {
  switch (AddressSpace) {
  case AMDGPUAS::CONSTANT_BUFFER_0:
    return 512;
  case AMDGPUAS::CONSTANT_BUFFER_1:
    return 512 + 4096;
  case AMDGPUAS::CONSTANT_BUFFER_2:
    return 512 + 4096 * 2;
  case AMDGPUAS::CONSTANT_BUFFER_3:
    return 512 + 4096 * 3;
  case AMDGPUAS::CONSTANT_BUFFER_4:
    return 512 + 4096 * 4;
  case AMDGPUAS::CONSTANT_BUFFER_5:
    return 512 + 4096 * 5;
  case AMDGPUAS::CONSTANT_BUFFER_6:
    return 512 + 4096 * 6;
  case AMDGPUAS::CONSTANT_BUFFER_7:
    return 512 + 4096 * 7;
  case AMDGPUAS::CONSTANT_BUFFER_8:
    return 512 + 4096 * 8;
  case AMDGPUAS::CONSTANT_BUFFER_9:
    return 512 + 4096 * 9;
  case AMDGPUAS::CONSTANT_BUFFER_10:
    return 512 + 4096 * 10;
  case AMDGPUAS::CONSTANT_BUFFER_11:
    return 512 + 4096 * 11;
  case AMDGPUAS::CONSTANT_BUFFER_12:
    return 512 + 4096 * 12;
  case AMDGPUAS::CONSTANT_BUFFER_13:
    return 512 + 4096 * 13;
  case AMDGPUAS::CONSTANT_BUFFER_14:
    return 512 + 4096 * 14;
  case AMDGPUAS::CONSTANT_BUFFER_15:
    return 512 + 4096 * 15;
  default:
    return -1;
  }
}
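// For example, CONSTANT_BUFFER_0 maps to 512 and CONSTANT_BUFFER_2 to
// 512 + 4096 * 2 = 8704; kc_bank selects one of sixteen 4096-slot windows
// placed above the first 512 slots.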
SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
{
  EVT VT = Op.getValueType();
  SDLoc DL(Op);
  LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
  SDValue Chain = Op.getOperand(0);
  SDValue Ptr = Op.getOperand(1);
  SDValue LoweredLoad;

  int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
  if (ConstantBlock > -1) {
    SDValue Result;
    if (dyn_cast<ConstantExpr>(LoadNode->getSrcValue()) ||
        dyn_cast<Constant>(LoadNode->getSrcValue()) ||
        dyn_cast<ConstantSDNode>(Ptr)) {
      SDValue Slots[4];
      for (unsigned i = 0; i < 4; i++) {
        // We want the Const position encoded with the following formula:
        // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
        // const_index is Ptr computed by llvm using an alignment of 16.
        // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and
        // then div by 4 at the ISel step.
        SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
            DAG.getConstant(4 * i + ConstantBlock * 16, MVT::i32));
        Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
      }
      Result = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Slots, 4);
    } else {
      // A non-constant Ptr can't be folded; keep it as a v4i32 load.
      Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
          DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, MVT::i32)),
          DAG.getConstant(LoadNode->getAddressSpace() -
                          AMDGPUAS::CONSTANT_BUFFER_0, MVT::i32)
          );
    }

    if (!VT.isVector()) {
      Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
                           DAG.getConstant(0, MVT::i32));
    }

    SDValue MergedValues[2] = {
      Result,
      Chain
    };
    return DAG.getMergeValues(MergedValues, 2, DL);
  }

  if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
    return SDValue();
  }

  // Lowering for indirect addressing
  const MachineFunction &MF = DAG.getMachineFunction();
  const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
      getTargetMachine().getFrameLowering());
  unsigned StackWidth = TFL->getStackWidth(MF);

  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);

  if (VT.isVector()) {
    unsigned NumElemVT = VT.getVectorNumElements();
    EVT ElemVT = VT.getVectorElementType();
    SDValue Loads[4];

    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
                                      "vector width in load");

    for (unsigned i = 0; i < NumElemVT; ++i) {
      unsigned Channel, PtrIncr;
      getStackAddress(StackWidth, i, Channel, PtrIncr);
      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
                        DAG.getConstant(PtrIncr, MVT::i32));
      Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
                             Chain, Ptr,
                             DAG.getTargetConstant(Channel, MVT::i32),
                             Op.getOperand(2));
    }
    for (unsigned i = NumElemVT; i < 4; ++i) {
      Loads[i] = DAG.getUNDEF(ElemVT);
    }
    EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4);
    LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads, 4);
  } else {
    LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
                              Chain, Ptr,
                              DAG.getTargetConstant(0, MVT::i32), // Channel
                              Op.getOperand(2));
  }

  SDValue Ops[2];
  Ops[0] = LoweredLoad;
  Ops[1] = Chain;

  return DAG.getMergeValues(Ops, 2, DL);
}

/// XXX Only kernel functions are supported, so we can assume for now that
/// every function is a kernel function, but in the future we should use
/// separate calling conventions for kernel and non-kernel functions.
SDValue R600TargetLowering::LowerFormalArguments(
    SDValue Chain,
    CallingConv::ID CallConv,
    bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins,
    SDLoc DL, SelectionDAG &DAG,
    SmallVectorImpl<SDValue> &InVals) const {
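  // The starting offset of 36 bytes below accounts for the nine implicit
  // dword parameters (ngroups, global_size and local_size for x/y/z) that
  // LowerImplicitParameter reads at dword offsets 0-8; explicit kernel
  // arguments are laid out after them (inferred from this file, not a
  // separate ABI document).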
  unsigned ParamOffsetBytes = 36;
  Function::const_arg_iterator FuncArg =
      DAG.getMachineFunction().getFunction()->arg_begin();
  for (unsigned i = 0, e = Ins.size(); i < e; ++i, ++FuncArg) {
    EVT VT = Ins[i].VT;
    Type *ArgType = FuncArg->getType();
    unsigned ArgSizeInBits = ArgType->isPointerTy() ?
                             32 : ArgType->getPrimitiveSizeInBits();
    unsigned ArgBytes = ArgSizeInBits >> 3;
    EVT ArgVT;
    if (ArgSizeInBits < VT.getSizeInBits()) {
      assert(!ArgType->isFloatTy() &&
             "Extending floating point arguments not supported yet");
      ArgVT = MVT::getIntegerVT(ArgSizeInBits);
    } else {
      ArgVT = VT;
    }
    PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
                                          AMDGPUAS::PARAM_I_ADDRESS);
    SDValue Arg = DAG.getExtLoad(ISD::ZEXTLOAD, DL, VT, DAG.getRoot(),
                                 DAG.getConstant(ParamOffsetBytes, MVT::i32),
                                 MachinePointerInfo(UndefValue::get(PtrTy)),
                                 ArgVT, false, false, ArgBytes);
    InVals.push_back(Arg);
    ParamOffsetBytes += ArgBytes;
  }
  return Chain;
}

EVT R600TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
  if (!VT.isVector()) return MVT::i32;
  return VT.changeVectorElementTypeToInteger();
}

//===----------------------------------------------------------------------===//
// Custom DAG Optimizations
//===----------------------------------------------------------------------===//

SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;

  switch (N->getOpcode()) {
  // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
  case ISD::FP_ROUND: {
    SDValue Arg = N->getOperand(0);
    if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
      return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), N->getValueType(0),
                         Arg.getOperand(0));
    }
    break;
  }

  // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
  // (i32 select_cc f32, f32, -1, 0 cc)
  //
  // Mesa's GLSL frontend generates the above pattern a lot and we can lower
  // this to one of the SET*_DX10 instructions.
  case ISD::FP_TO_SINT: {
    SDValue FNeg = N->getOperand(0);
    if (FNeg.getOpcode() != ISD::FNEG) {
      return SDValue();
    }
    SDValue SelectCC = FNeg.getOperand(0);
    if (SelectCC.getOpcode() != ISD::SELECT_CC ||
        SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
        SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
        !isHWTrueValue(SelectCC.getOperand(2)) ||
        !isHWFalseValue(SelectCC.getOperand(3))) {
      return SDValue();
    }

    return DAG.getNode(ISD::SELECT_CC, SDLoc(N), N->getValueType(0),
                       SelectCC.getOperand(0), // LHS
                       SelectCC.getOperand(1), // RHS
                       DAG.getConstant(-1, MVT::i32), // True
                       DAG.getConstant(0, MVT::i32),  // False
                       SelectCC.getOperand(4)); // CC

    break;
  }
  // Extract_vec (Build_vector) generated by custom lowering
  // also needs to be custom combined.
  case ISD::EXTRACT_VECTOR_ELT: {
    SDValue Arg = N->getOperand(0);
    if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
        unsigned Element = Const->getZExtValue();
        return Arg->getOperand(Element);
      }
    }
    if (Arg.getOpcode() == ISD::BITCAST &&
        Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
        unsigned Element = Const->getZExtValue();
        return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getVTList(),
                           Arg->getOperand(0).getOperand(Element));
      }
    }
  }

  case ISD::SELECT_CC: {
    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
    // selectcc x, y, a, b, inv(cc)
    //
    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
    // selectcc x, y, a, b, cc
    SDValue LHS = N->getOperand(0);
    if (LHS.getOpcode() != ISD::SELECT_CC) {
      return SDValue();
    }

    SDValue RHS = N->getOperand(1);
    SDValue True = N->getOperand(2);
    SDValue False = N->getOperand(3);
    ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();

    if (LHS.getOperand(2).getNode() != True.getNode() ||
        LHS.getOperand(3).getNode() != False.getNode() ||
        RHS.getNode() != False.getNode()) {
      return SDValue();
    }

    switch (NCC) {
    default: return SDValue();
    case ISD::SETNE: return LHS;
    case ISD::SETEQ: {
      ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
      LHSCC = ISD::getSetCCInverse(LHSCC,
                                   LHS.getOperand(0).getValueType().isInteger());
      return DAG.getSelectCC(SDLoc(N),
                             LHS.getOperand(0),
                             LHS.getOperand(1),
                             LHS.getOperand(2),
                             LHS.getOperand(3),
                             LHSCC);
    }
    }
  }
  case AMDGPUISD::EXPORT: {
    SDValue Arg = N->getOperand(1);
    if (Arg.getOpcode() != ISD::BUILD_VECTOR)
      break;
    SDValue NewBldVec[4] = {
      DAG.getUNDEF(MVT::f32),
      DAG.getUNDEF(MVT::f32),
      DAG.getUNDEF(MVT::f32),
      DAG.getUNDEF(MVT::f32)
    };
    SDValue NewArgs[8] = {
      N->getOperand(0), // Chain
      SDValue(),
      N->getOperand(2), // ArrayBase
      N->getOperand(3), // Type
      N->getOperand(4), // SWZ_X
      N->getOperand(5), // SWZ_Y
      N->getOperand(6), // SWZ_Z
      N->getOperand(7)  // SWZ_W
    };
    for (unsigned i = 0; i < Arg.getNumOperands(); i++) {
      if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Arg.getOperand(i))) {
        if (C->isZero()) {
          NewArgs[4 + i] = DAG.getConstant(4, MVT::i32); // SEL_0
        } else if (C->isExactlyValue(1.0)) {
          NewArgs[4 + i] = DAG.getConstant(5, MVT::i32); // SEL_1
        } else {
          NewBldVec[i] = Arg.getOperand(i);
        }
      } else {
        NewBldVec[i] = Arg.getOperand(i);
      }
    }
    SDLoc DL(N);
    NewArgs[1] = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4f32, NewBldVec, 4);
    return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs, 8);
  }
  }
  return SDValue();
}