R600ISelLowering.cpp revision b5632b5b456db647b42239cbd4d8b58c82290c4e
//===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Custom DAG lowering for R600
//
//===----------------------------------------------------------------------===//

#include "R600ISelLowering.h"
#include "R600Defines.h"
#include "R600InstrInfo.h"
#include "R600MachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Function.h"

using namespace llvm;

R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
    AMDGPUTargetLowering(TM) {
  addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
  addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
  addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
  addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
  computeRegisterProperties();

  setOperationAction(ISD::FADD, MVT::v4f32, Expand);
  setOperationAction(ISD::FMUL, MVT::v4f32, Expand);
  setOperationAction(ISD::FDIV, MVT::v4f32, Expand);
  setOperationAction(ISD::FSUB, MVT::v4f32, Expand);

  setOperationAction(ISD::ADD, MVT::v4i32, Expand);
  setOperationAction(ISD::AND, MVT::v4i32, Expand);
  setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Expand);
  setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Expand);
  setOperationAction(ISD::MUL, MVT::v2i32, Expand);
  setOperationAction(ISD::MUL, MVT::v4i32, Expand);
  setOperationAction(ISD::OR, MVT::v4i32, Expand);
  setOperationAction(ISD::OR, MVT::v2i32, Expand);
  setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Expand);
  setOperationAction(ISD::SHL, MVT::v4i32, Expand);
  setOperationAction(ISD::SHL, MVT::v2i32, Expand);
  setOperationAction(ISD::SRL, MVT::v4i32, Expand);
  setOperationAction(ISD::SRL, MVT::v2i32, Expand);
  setOperationAction(ISD::SRA, MVT::v4i32, Expand);
  setOperationAction(ISD::SRA, MVT::v2i32, Expand);
  setOperationAction(ISD::SUB, MVT::v4i32, Expand);
  setOperationAction(ISD::SUB, MVT::v2i32, Expand);
  setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Expand);
  setOperationAction(ISD::UDIV, MVT::v4i32, Expand);
  setOperationAction(ISD::UREM, MVT::v4i32, Expand);
  setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
  setOperationAction(ISD::XOR, MVT::v4i32, Expand);
  setOperationAction(ISD::XOR, MVT::v2i32, Expand);

  setOperationAction(ISD::BR_CC, MVT::i32, Expand);
  setOperationAction(ISD::BR_CC, MVT::f32, Expand);

  setOperationAction(ISD::FSUB, MVT::f32, Expand);

  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);

  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);

  setOperationAction(ISD::SETCC, MVT::i32, Expand);
  setOperationAction(ISD::SETCC, MVT::f32, Expand);
  setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);

  setOperationAction(ISD::SELECT, MVT::i32, Custom);
  setOperationAction(ISD::SELECT, MVT::f32, Custom);

  setOperationAction(ISD::VSELECT, MVT::v4i32, Expand);
  setOperationAction(ISD::VSELECT, MVT::v2i32, Expand);
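
  // Editor's note: the R600 ISA has no cross-channel vector ALU
  // instructions, so every vector operation marked Expand above is
  // scalarized by the legalizer into per-channel operations.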

  // Legalize loads and stores to the private address space.
  setOperationAction(ISD::LOAD, MVT::i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4i8, Custom);
  setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i8, Custom);
  setOperationAction(ISD::STORE, MVT::i8, Custom);
  setOperationAction(ISD::STORE, MVT::i32, Custom);
  setOperationAction(ISD::STORE, MVT::v2i32, Custom);
  setOperationAction(ISD::STORE, MVT::v4i32, Custom);

  setOperationAction(ISD::LOAD, MVT::i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
  setOperationAction(ISD::FrameIndex, MVT::i32, Custom);

  setTargetDAGCombine(ISD::FP_ROUND);
  setTargetDAGCombine(ISD::FP_TO_SINT);
  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
  setTargetDAGCombine(ISD::SELECT_CC);

  setBooleanContents(ZeroOrNegativeOneBooleanContent);
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
  setSchedulingPreference(Sched::VLIW);
}

MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
    MachineInstr * MI, MachineBasicBlock * BB) const {
  MachineFunction * MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  MachineBasicBlock::iterator I = *MI;
  const R600InstrInfo *TII =
      static_cast<const R600InstrInfo*>(MF->getTarget().getInstrInfo());

  switch (MI->getOpcode()) {
  default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
  case AMDGPU::CLAMP_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
                                                       AMDGPU::MOV,
                                                       MI->getOperand(0).getReg(),
                                                       MI->getOperand(1).getReg());
    TII->addFlag(NewMI, 0, MO_FLAG_CLAMP);
    break;
  }

  case AMDGPU::FABS_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
                                                       AMDGPU::MOV,
                                                       MI->getOperand(0).getReg(),
                                                       MI->getOperand(1).getReg());
    TII->addFlag(NewMI, 0, MO_FLAG_ABS);
    break;
  }

  case AMDGPU::FNEG_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
                                                       AMDGPU::MOV,
                                                       MI->getOperand(0).getReg(),
                                                       MI->getOperand(1).getReg());
    TII->addFlag(NewMI, 0, MO_FLAG_NEG);
    break;
  }

  case AMDGPU::MASK_WRITE: {
    unsigned maskedRegister = MI->getOperand(0).getReg();
    assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
    MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
    TII->addFlag(defInstr, 0, MO_FLAG_MASK);
    break;
  }

  case AMDGPU::MOV_IMM_F32:
    TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
                     MI->getOperand(1).getFPImm()->getValueAPF()
                         .bitcastToAPInt().getZExtValue());
    break;
  case AMDGPU::MOV_IMM_I32:
    TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
                     MI->getOperand(1).getImm());
    break;
  case AMDGPU::CONST_COPY: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV,
        MI->getOperand(0).getReg(), AMDGPU::ALU_CONST);
    TII->setImmOperand(NewMI, R600Operands::SRC0_SEL,
        MI->getOperand(1).getImm());
    break;
  }
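
  // RAT (Random Access Target) stores: when the next instruction is a
  // RETURN, this store is the last instruction of the program and must
  // carry the end-of-program (EOP) bit.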
  case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
  case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
    unsigned EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;

    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
            .addOperand(MI->getOperand(0))
            .addOperand(MI->getOperand(1))
            .addImm(EOP); // Set End of program bit
    break;
  }

  case AMDGPU::TXD: {
    unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    MachineOperand &RID = MI->getOperand(4);
    MachineOperand &SID = MI->getOperand(5);
    unsigned TextureId = MI->getOperand(6).getImm();
    unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
    unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;

    switch (TextureId) {
    case 5: // Rect
      CTX = CTY = 0;
      break;
    case 6: // Shadow1D
      SrcW = SrcZ;
      break;
    case 7: // Shadow2D
      SrcW = SrcZ;
      break;
    case 8: // ShadowRect
      CTX = CTY = 0;
      SrcW = SrcZ;
      break;
    case 9: // 1DArray
      SrcZ = SrcY;
      CTZ = 0;
      break;
    case 10: // 2DArray
      CTZ = 0;
      break;
    case 11: // Shadow1DArray
      SrcZ = SrcY;
      CTZ = 0;
      break;
    case 12: // Shadow2DArray
      CTZ = 0;
      break;
    }
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
            .addOperand(MI->getOperand(3))
            .addImm(SrcX)
            .addImm(SrcY)
            .addImm(SrcZ)
            .addImm(SrcW)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(1)
            .addImm(2)
            .addImm(3)
            .addOperand(RID)
            .addOperand(SID)
            .addImm(CTX)
            .addImm(CTY)
            .addImm(CTZ)
            .addImm(CTW);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
            .addOperand(MI->getOperand(2))
            .addImm(SrcX)
            .addImm(SrcY)
            .addImm(SrcZ)
            .addImm(SrcW)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(1)
            .addImm(2)
            .addImm(3)
            .addOperand(RID)
            .addOperand(SID)
            .addImm(CTX)
            .addImm(CTY)
            .addImm(CTZ)
            .addImm(CTW);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
            .addOperand(MI->getOperand(0))
            .addOperand(MI->getOperand(1))
            .addImm(SrcX)
            .addImm(SrcY)
            .addImm(SrcZ)
            .addImm(SrcW)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(1)
            .addImm(2)
            .addImm(3)
            .addOperand(RID)
            .addOperand(SID)
            .addImm(CTX)
            .addImm(CTY)
            .addImm(CTZ)
            .addImm(CTW)
            .addReg(T0, RegState::Implicit)
            .addReg(T1, RegState::Implicit);
    break;
  }
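
  // TXD_SHADOW mirrors TXD above, but samples with TEX_SAMPLE_C_G, the
  // depth-compare variant.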
  case AMDGPU::TXD_SHADOW: {
    unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    MachineOperand &RID = MI->getOperand(4);
    MachineOperand &SID = MI->getOperand(5);
    unsigned TextureId = MI->getOperand(6).getImm();
    unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
    unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;

    switch (TextureId) {
    case 5: // Rect
      CTX = CTY = 0;
      break;
    case 6: // Shadow1D
      SrcW = SrcZ;
      break;
    case 7: // Shadow2D
      SrcW = SrcZ;
      break;
    case 8: // ShadowRect
      CTX = CTY = 0;
      SrcW = SrcZ;
      break;
    case 9: // 1DArray
      SrcZ = SrcY;
      CTZ = 0;
      break;
    case 10: // 2DArray
      CTZ = 0;
      break;
    case 11: // Shadow1DArray
      SrcZ = SrcY;
      CTZ = 0;
      break;
    case 12: // Shadow2DArray
      CTZ = 0;
      break;
    }

    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
            .addOperand(MI->getOperand(3))
            .addImm(SrcX)
            .addImm(SrcY)
            .addImm(SrcZ)
            .addImm(SrcW)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(1)
            .addImm(2)
            .addImm(3)
            .addOperand(RID)
            .addOperand(SID)
            .addImm(CTX)
            .addImm(CTY)
            .addImm(CTZ)
            .addImm(CTW);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
            .addOperand(MI->getOperand(2))
            .addImm(SrcX)
            .addImm(SrcY)
            .addImm(SrcZ)
            .addImm(SrcW)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(1)
            .addImm(2)
            .addImm(3)
            .addOperand(RID)
            .addOperand(SID)
            .addImm(CTX)
            .addImm(CTY)
            .addImm(CTZ)
            .addImm(CTW);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
            .addOperand(MI->getOperand(0))
            .addOperand(MI->getOperand(1))
            .addImm(SrcX)
            .addImm(SrcY)
            .addImm(SrcZ)
            .addImm(SrcW)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(1)
            .addImm(2)
            .addImm(3)
            .addOperand(RID)
            .addOperand(SID)
            .addImm(CTX)
            .addImm(CTY)
            .addImm(CTZ)
            .addImm(CTW)
            .addReg(T0, RegState::Implicit)
            .addReg(T1, RegState::Implicit);
    break;
  }

  case AMDGPU::BRANCH:
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
            .addOperand(MI->getOperand(0));
    break;

  case AMDGPU::BRANCH_COND_f32: {
    MachineInstr *NewMI =
        BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
                AMDGPU::PREDICATE_BIT)
            .addOperand(MI->getOperand(1))
            .addImm(OPCODE_IS_NOT_ZERO)
            .addImm(0); // Flags
    TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
            .addOperand(MI->getOperand(0))
            .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
    break;
  }

  case AMDGPU::BRANCH_COND_i32: {
    MachineInstr *NewMI =
        BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
                AMDGPU::PREDICATE_BIT)
            .addOperand(MI->getOperand(1))
            .addImm(OPCODE_IS_NOT_ZERO_INT)
            .addImm(0); // Flags
    TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
            .addOperand(MI->getOperand(0))
            .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
    break;
  }
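
  // Exports: only the last export of a given type needs the EXPORT_DONE
  // CF opcode (84 on Evergreen, 40 on R600), and an export followed by a
  // RETURN additionally sets the end-of-program bit.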
  case AMDGPU::EG_ExportSwz:
  case AMDGPU::R600_ExportSwz: {
    // Instruction is left unmodified if it's not the last one of its type.
    bool isLastInstructionOfItsType = true;
    unsigned InstExportType = MI->getOperand(1).getImm();
    for (MachineBasicBlock::iterator NextExportInst = llvm::next(I),
         EndBlock = BB->end(); NextExportInst != EndBlock;
         NextExportInst = llvm::next(NextExportInst)) {
      if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
          NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
        unsigned CurrentInstExportType = NextExportInst->getOperand(1)
            .getImm();
        if (CurrentInstExportType == InstExportType) {
          isLastInstructionOfItsType = false;
          break;
        }
      }
    }
    bool EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
    if (!EOP && !isLastInstructionOfItsType)
      return BB;
    unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz) ? 84 : 40;
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
            .addOperand(MI->getOperand(0))
            .addOperand(MI->getOperand(1))
            .addOperand(MI->getOperand(2))
            .addOperand(MI->getOperand(3))
            .addOperand(MI->getOperand(4))
            .addOperand(MI->getOperand(5))
            .addOperand(MI->getOperand(6))
            .addImm(CfInst)
            .addImm(EOP);
    break;
  }
  case AMDGPU::RETURN: {
    // RETURN instructions must have the live-out registers as implicit uses,
    // otherwise they appear dead.
    R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
    MachineInstrBuilder MIB(*MF, MI);
    for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i)
      MIB.addReg(MFI->LiveOuts[i], RegState::Implicit);
    return BB;
  }
  }

  MI->eraseFromParent();
  return BB;
}

//===----------------------------------------------------------------------===//
// Custom DAG Lowering Operations
//===----------------------------------------------------------------------===//

SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
  case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
  case ISD::SELECT: return LowerSELECT(Op, DAG);
  case ISD::STORE: return LowerSTORE(Op, DAG);
  case ISD::LOAD: return LowerLOAD(Op, DAG);
  case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
  case ISD::INTRINSIC_VOID: {
    SDValue Chain = Op.getOperand(0);
    unsigned IntrinsicID =
        cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    switch (IntrinsicID) {
    case AMDGPUIntrinsic::AMDGPU_store_output: {
      MachineFunction &MF = DAG.getMachineFunction();
      R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
      unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
      MFI->LiveOuts.push_back(Reg);
      return DAG.getCopyToReg(Chain, SDLoc(Op), Reg, Op.getOperand(2));
    }
    case AMDGPUIntrinsic::R600_store_swizzle: {
      const SDValue Args[8] = {
        Chain,
        Op.getOperand(2), // Export Value
        Op.getOperand(3), // ArrayBase
        Op.getOperand(4), // Type
        DAG.getConstant(0, MVT::i32), // SWZ_X
        DAG.getConstant(1, MVT::i32), // SWZ_Y
        DAG.getConstant(2, MVT::i32), // SWZ_Z
        DAG.getConstant(3, MVT::i32)  // SWZ_W
      };
      return DAG.getNode(AMDGPUISD::EXPORT, SDLoc(Op), Op.getValueType(),
                         Args, 8);
    }

    // default for switch(IntrinsicID)
    default: break;
    }
    // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
    break;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IntrinsicID =
        cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
    EVT VT = Op.getValueType();
    SDLoc DL(Op);
    switch(IntrinsicID) {
    default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
    case AMDGPUIntrinsic::R600_load_input: {
      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
      unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
      MachineFunction &MF = DAG.getMachineFunction();
      MachineRegisterInfo &MRI = MF.getRegInfo();
      MRI.addLiveIn(Reg);
      return DAG.getCopyFromReg(DAG.getEntryNode(),
          SDLoc(DAG.getEntryNode()), Reg, VT);
    }
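
    // R600_interp_input: 'slot' selects the parameter and channel to
    // interpolate. A negative 'ijb' requests constant (flat)
    // interpolation; otherwise it selects the register pair holding the
    // I/J barycentric coordinates.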
    case AMDGPUIntrinsic::R600_interp_input: {
      int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
      int ijb = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue();
      MachineSDNode *interp;
      if (ijb < 0) {
        const MachineFunction &MF = DAG.getMachineFunction();
        const R600InstrInfo *TII =
            static_cast<const R600InstrInfo*>(MF.getTarget().getInstrInfo());
        interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL,
            MVT::v4f32, DAG.getTargetConstant(slot / 4 , MVT::i32));
        return DAG.getTargetExtractSubreg(
            TII->getRegisterInfo().getSubRegFromChannel(slot % 4),
            DL, MVT::f32, SDValue(interp, 0));
      }

      MachineFunction &MF = DAG.getMachineFunction();
      MachineRegisterInfo &MRI = MF.getRegInfo();
      unsigned RegisterI = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb);
      unsigned RegisterJ = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1);
      MRI.addLiveIn(RegisterI);
      MRI.addLiveIn(RegisterJ);
      SDValue RegisterINode = DAG.getCopyFromReg(DAG.getEntryNode(),
          SDLoc(DAG.getEntryNode()), RegisterI, MVT::f32);
      SDValue RegisterJNode = DAG.getCopyFromReg(DAG.getEntryNode(),
          SDLoc(DAG.getEntryNode()), RegisterJ, MVT::f32);

      if (slot % 4 < 2)
        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
            MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
            RegisterJNode, RegisterINode);
      else
        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
            MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
            RegisterJNode, RegisterINode);
      return SDValue(interp, slot % 2);
    }
    case AMDGPUIntrinsic::R600_tex:
    case AMDGPUIntrinsic::R600_texc:
    case AMDGPUIntrinsic::R600_txl:
    case AMDGPUIntrinsic::R600_txlc:
    case AMDGPUIntrinsic::R600_txb:
    case AMDGPUIntrinsic::R600_txbc:
    case AMDGPUIntrinsic::R600_txf:
    case AMDGPUIntrinsic::R600_txq:
    case AMDGPUIntrinsic::R600_ddx:
    case AMDGPUIntrinsic::R600_ddy: {
      unsigned TextureOp;
      switch (IntrinsicID) {
      case AMDGPUIntrinsic::R600_tex:
        TextureOp = 0;
        break;
      case AMDGPUIntrinsic::R600_texc:
        TextureOp = 1;
        break;
      case AMDGPUIntrinsic::R600_txl:
        TextureOp = 2;
        break;
      case AMDGPUIntrinsic::R600_txlc:
        TextureOp = 3;
        break;
      case AMDGPUIntrinsic::R600_txb:
        TextureOp = 4;
        break;
      case AMDGPUIntrinsic::R600_txbc:
        TextureOp = 5;
        break;
      case AMDGPUIntrinsic::R600_txf:
        TextureOp = 6;
        break;
      case AMDGPUIntrinsic::R600_txq:
        TextureOp = 7;
        break;
      case AMDGPUIntrinsic::R600_ddx:
        TextureOp = 8;
        break;
      case AMDGPUIntrinsic::R600_ddy:
        TextureOp = 9;
        break;
      default:
        llvm_unreachable("Unknown Texture Operation");
      }

      SDValue TexArgs[19] = {
        DAG.getConstant(TextureOp, MVT::i32),
        Op.getOperand(1),
        DAG.getConstant(0, MVT::i32),
        DAG.getConstant(1, MVT::i32),
        DAG.getConstant(2, MVT::i32),
        DAG.getConstant(3, MVT::i32),
        Op.getOperand(2),
        Op.getOperand(3),
        Op.getOperand(4),
        DAG.getConstant(0, MVT::i32),
        DAG.getConstant(1, MVT::i32),
        DAG.getConstant(2, MVT::i32),
        DAG.getConstant(3, MVT::i32),
        Op.getOperand(5),
        Op.getOperand(6),
        Op.getOperand(7),
        Op.getOperand(8),
        Op.getOperand(9),
        Op.getOperand(10)
      };
      return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs, 19);
    }
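
    // AMDGPU_dp4: DOT4 takes its two vec4 sources as eight scalar
    // operands interleaved per channel (x0, x1, y0, y1, z0, z1, w0, w1).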
    case AMDGPUIntrinsic::AMDGPU_dp4: {
      SDValue Args[8] = {
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
            DAG.getConstant(0, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
            DAG.getConstant(0, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
            DAG.getConstant(1, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
            DAG.getConstant(1, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
            DAG.getConstant(2, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
            DAG.getConstant(2, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
            DAG.getConstant(3, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
            DAG.getConstant(3, MVT::i32))
      };
      return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args, 8);
    }

    case Intrinsic::r600_read_ngroups_x:
      return LowerImplicitParameter(DAG, VT, DL, 0);
    case Intrinsic::r600_read_ngroups_y:
      return LowerImplicitParameter(DAG, VT, DL, 1);
    case Intrinsic::r600_read_ngroups_z:
      return LowerImplicitParameter(DAG, VT, DL, 2);
    case Intrinsic::r600_read_global_size_x:
      return LowerImplicitParameter(DAG, VT, DL, 3);
    case Intrinsic::r600_read_global_size_y:
      return LowerImplicitParameter(DAG, VT, DL, 4);
    case Intrinsic::r600_read_global_size_z:
      return LowerImplicitParameter(DAG, VT, DL, 5);
    case Intrinsic::r600_read_local_size_x:
      return LowerImplicitParameter(DAG, VT, DL, 6);
    case Intrinsic::r600_read_local_size_y:
      return LowerImplicitParameter(DAG, VT, DL, 7);
    case Intrinsic::r600_read_local_size_z:
      return LowerImplicitParameter(DAG, VT, DL, 8);

    case Intrinsic::r600_read_tgid_x:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T1_X, VT);
    case Intrinsic::r600_read_tgid_y:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T1_Y, VT);
    case Intrinsic::r600_read_tgid_z:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T1_Z, VT);
    case Intrinsic::r600_read_tidig_x:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T0_X, VT);
    case Intrinsic::r600_read_tidig_y:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T0_Y, VT);
    case Intrinsic::r600_read_tidig_z:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T0_Z, VT);
    }
    // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
    break;
  }
  } // end switch(Op.getOpcode())
  return SDValue();
}

void R600TargetLowering::ReplaceNodeResults(SDNode *N,
                                            SmallVectorImpl<SDValue> &Results,
                                            SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  default: return;
  case ISD::FP_TO_UINT: Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
    return;
  case ISD::LOAD: {
    SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode();
    Results.push_back(SDValue(Node, 0));
    Results.push_back(SDValue(Node, 1));
    // XXX: LLVM seems not to replace Chain Value inside CustomWidenLowerNode
    // function
    DAG.ReplaceAllUsesOfValueWith(SDValue(N,1), SDValue(Node, 1));
    return;
  }
  case ISD::STORE:
    SDNode *Node = LowerSTORE(SDValue(N, 0), DAG).getNode();
    Results.push_back(SDValue(Node, 0));
    return;
  }
}

SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
  return DAG.getNode(
      ISD::SETCC,
      SDLoc(Op),
      MVT::i1,
      Op, DAG.getConstantFP(0.0f, MVT::f32),
      DAG.getCondCode(ISD::SETNE)
      );
}
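
// Implicit kernel parameters (ngroups, global and local sizes) are stored
// at fixed dword offsets in the PARAM_I constant address space.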
SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
                                                   SDLoc DL,
                                                   unsigned DwordOffset) const {
  unsigned ByteOffset = DwordOffset * 4;
  PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
                                           AMDGPUAS::PARAM_I_ADDRESS);

  // We shouldn't be using an offset wider than 16 bits for implicit parameters.
  assert(isInt<16>(ByteOffset));

  return DAG.getLoad(VT, DL, DAG.getEntryNode(),
                     DAG.getConstant(ByteOffset, MVT::i32), // PTR
                     MachinePointerInfo(ConstantPointerNull::get(PtrType)),
                     false, false, false, 0);
}

SDValue R600TargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {

  MachineFunction &MF = DAG.getMachineFunction();
  const AMDGPUFrameLowering *TFL =
      static_cast<const AMDGPUFrameLowering*>(getTargetMachine().getFrameLowering());

  FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Op);
  assert(FIN);

  unsigned FrameIndex = FIN->getIndex();
  unsigned Offset = TFL->getFrameIndexOffset(MF, FrameIndex);
  return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), MVT::i32);
}
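
// Helper used by LowerSELECT_CC: matches integer and floating-point zero
// constants.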
bool R600TargetLowering::isZero(SDValue Op) const {
  if(ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
    return Cst->isNullValue();
  } else if(ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)){
    return CstFP->isZero();
  } else {
    return false;
  }
}

SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDValue True = Op.getOperand(2);
  SDValue False = Op.getOperand(3);
  SDValue CC = Op.getOperand(4);
  SDValue Temp;

  // LHS and RHS are guaranteed to be the same value type
  EVT CompareVT = LHS.getValueType();

  // Check if we can lower this to a native operation.

  // Try to lower to a SET* instruction:
  //
  // SET* can match the following patterns:
  //
  // select_cc f32, f32, -1, 0, cc_any
  // select_cc f32, f32, 1.0f, 0.0f, cc_any
  // select_cc i32, i32, -1, 0, cc_any
  //

  // Move hardware True/False values to the correct operand.
  if (isHWTrueValue(False) && isHWFalseValue(True)) {
    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
    std::swap(False, True);
    CC = DAG.getCondCode(ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32));
  }

  if (isHWTrueValue(True) && isHWFalseValue(False) &&
      (CompareVT == VT || VT == MVT::i32)) {
    // This can be matched by a SET* instruction.
    return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
  }

  // Try to lower to a CND* instruction:
  //
  // CND* can match the following patterns:
  //
  // select_cc f32, 0.0, f32, f32, cc_any
  // select_cc f32, 0.0, i32, i32, cc_any
  // select_cc i32, 0, f32, f32, cc_any
  // select_cc i32, 0, i32, i32, cc_any
  //
  if (isZero(LHS) || isZero(RHS)) {
    SDValue Cond = (isZero(LHS) ? RHS : LHS);
    SDValue Zero = (isZero(LHS) ? LHS : RHS);
    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
    if (CompareVT != VT) {
      // Bitcast True / False to the correct types.  This will end up being
      // a nop, but it allows us to define only a single pattern in the
      // .TD files for each CND* instruction rather than having to have
      // one pattern for integer True/False and one for fp True/False
      True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
      False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
    }
    if (isZero(LHS)) {
      CCOpcode = ISD::getSetCCSwappedOperands(CCOpcode);
    }

    switch (CCOpcode) {
    case ISD::SETONE:
    case ISD::SETUNE:
    case ISD::SETNE:
    case ISD::SETULE:
    case ISD::SETULT:
    case ISD::SETOLE:
    case ISD::SETOLT:
    case ISD::SETLE:
    case ISD::SETLT:
      CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
      Temp = True;
      True = False;
      False = Temp;
      break;
    default:
      break;
    }
    SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
        Cond, Zero,
        True, False,
        DAG.getCondCode(CCOpcode));
    return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
  }

  // Possible Min/Max pattern
  SDValue MinMax = LowerMinMax(Op, DAG);
  if (MinMax.getNode()) {
    return MinMax;
  }

  // If we make it this far, it means we have no native instructions to handle
  // this SELECT_CC, so we must lower it.
  SDValue HWTrue, HWFalse;

  if (CompareVT == MVT::f32) {
    HWTrue = DAG.getConstantFP(1.0f, CompareVT);
    HWFalse = DAG.getConstantFP(0.0f, CompareVT);
  } else if (CompareVT == MVT::i32) {
    HWTrue = DAG.getConstant(-1, CompareVT);
    HWFalse = DAG.getConstant(0, CompareVT);
  } else {
    assert(!"Unhandled value type in LowerSELECT_CC");
  }

  // Lower this unsupported SELECT_CC into a combination of two supported
  // SELECT_CC operations.
  SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC);

  return DAG.getNode(ISD::SELECT_CC, DL, VT,
      Cond, HWFalse,
      True, False,
      DAG.getCondCode(ISD::SETNE));
}

SDValue R600TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
  return DAG.getNode(ISD::SELECT_CC,
      SDLoc(Op),
      Op.getValueType(),
      Op.getOperand(0),
      DAG.getConstant(0, MVT::i32),
      Op.getOperand(1),
      Op.getOperand(2),
      DAG.getCondCode(ISD::SETNE));
}

/// LLVM generates byte-addressed pointers. For indirect addressing, we need to
/// convert these pointers to a register index. Each register holds
/// 16 bytes (4 x 32-bit sub-registers), but we need to take into account the
/// \p StackWidth, which tells us how many of the 4 sub-registers will be used
/// for indirect addressing.
SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
                                               unsigned StackWidth,
                                               SelectionDAG &DAG) const {
  unsigned SRLPad;
  switch(StackWidth) {
  case 1:
    SRLPad = 2;
    break;
  case 2:
    SRLPad = 3;
    break;
  case 4:
    SRLPad = 4;
    break;
  default: llvm_unreachable("Invalid stack width");
  }

  return DAG.getNode(ISD::SRL, SDLoc(Ptr), Ptr.getValueType(), Ptr,
                     DAG.getConstant(SRLPad, MVT::i32));
}

void R600TargetLowering::getStackAddress(unsigned StackWidth,
                                         unsigned ElemIdx,
                                         unsigned &Channel,
                                         unsigned &PtrIncr) const {
  switch (StackWidth) {
  default:
  case 1:
    Channel = 0;
    if (ElemIdx > 0) {
      PtrIncr = 1;
    } else {
      PtrIncr = 0;
    }
    break;
  case 2:
    Channel = ElemIdx % 2;
    if (ElemIdx == 2) {
      PtrIncr = 1;
    } else {
      PtrIncr = 0;
    }
    break;
  case 4:
    Channel = ElemIdx;
    PtrIncr = 0;
    break;
  }
}

SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
  SDValue Chain = Op.getOperand(0);
  SDValue Value = Op.getOperand(1);
  SDValue Ptr = Op.getOperand(2);

  if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
      Ptr->getOpcode() != AMDGPUISD::DWORDADDR) {
    // Convert pointer from byte address to dword address.
    Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
                      DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
                                  Ptr, DAG.getConstant(2, MVT::i32)));

    if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
      assert(!"Truncated and indexed stores not supported yet");
    } else {
      Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
    }
    return Chain;
  }

  EVT ValueVT = Value.getValueType();

  if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
    return SDValue();
  }

  // Lowering for indirect addressing

  const MachineFunction &MF = DAG.getMachineFunction();
  const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
                                         getTargetMachine().getFrameLowering());
  unsigned StackWidth = TFL->getStackWidth(MF);

  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);

  if (ValueVT.isVector()) {
    unsigned NumElemVT = ValueVT.getVectorNumElements();
    EVT ElemVT = ValueVT.getVectorElementType();
    SDValue Stores[4];

    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
                                      "vector width in load");

    for (unsigned i = 0; i < NumElemVT; ++i) {
      unsigned Channel, PtrIncr;
      getStackAddress(StackWidth, i, Channel, PtrIncr);
      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
                        DAG.getConstant(PtrIncr, MVT::i32));
      SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
                                 Value, DAG.getConstant(i, MVT::i32));

      Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
                              Chain, Elem, Ptr,
                              DAG.getTargetConstant(Channel, MVT::i32));
    }
    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores, NumElemVT);
  } else {
    if (ValueVT == MVT::i8) {
      Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
    }
    Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr,
                        DAG.getTargetConstant(0, MVT::i32)); // Channel
  }

  return Chain;
}
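
// Constant buffers are mapped into the kcache: bank N begins at
// slot 512 + N * 4096.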
// return (512 + (kc_bank << 12))
static int
ConstantAddressBlock(unsigned AddressSpace) {
  switch (AddressSpace) {
  case AMDGPUAS::CONSTANT_BUFFER_0:
    return 512;
  case AMDGPUAS::CONSTANT_BUFFER_1:
    return 512 + 4096;
  case AMDGPUAS::CONSTANT_BUFFER_2:
    return 512 + 4096 * 2;
  case AMDGPUAS::CONSTANT_BUFFER_3:
    return 512 + 4096 * 3;
  case AMDGPUAS::CONSTANT_BUFFER_4:
    return 512 + 4096 * 4;
  case AMDGPUAS::CONSTANT_BUFFER_5:
    return 512 + 4096 * 5;
  case AMDGPUAS::CONSTANT_BUFFER_6:
    return 512 + 4096 * 6;
  case AMDGPUAS::CONSTANT_BUFFER_7:
    return 512 + 4096 * 7;
  case AMDGPUAS::CONSTANT_BUFFER_8:
    return 512 + 4096 * 8;
  case AMDGPUAS::CONSTANT_BUFFER_9:
    return 512 + 4096 * 9;
  case AMDGPUAS::CONSTANT_BUFFER_10:
    return 512 + 4096 * 10;
  case AMDGPUAS::CONSTANT_BUFFER_11:
    return 512 + 4096 * 11;
  case AMDGPUAS::CONSTANT_BUFFER_12:
    return 512 + 4096 * 12;
  case AMDGPUAS::CONSTANT_BUFFER_13:
    return 512 + 4096 * 13;
  case AMDGPUAS::CONSTANT_BUFFER_14:
    return 512 + 4096 * 14;
  case AMDGPUAS::CONSTANT_BUFFER_15:
    return 512 + 4096 * 15;
  default:
    return -1;
  }
}
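
// Loads from a constant buffer are lowered to CONST_ADDRESS nodes. When
// the pointer is known at compile time, each channel reads its exact
// kcache slot; otherwise the whole vec4 slot is fetched and indexed at
// the ISel step.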
SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
{
  EVT VT = Op.getValueType();
  SDLoc DL(Op);
  LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
  SDValue Chain = Op.getOperand(0);
  SDValue Ptr = Op.getOperand(1);
  SDValue LoweredLoad;

  int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
  if (ConstantBlock > -1) {
    SDValue Result;
    if (dyn_cast<ConstantExpr>(LoadNode->getSrcValue()) ||
        dyn_cast<Constant>(LoadNode->getSrcValue()) ||
        dyn_cast<ConstantSDNode>(Ptr)) {
      SDValue Slots[4];
      for (unsigned i = 0; i < 4; i++) {
        // We want the Const position encoded with the following formula:
        // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
        // const_index is Ptr computed by llvm using an alignment of 16.
        // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and
        // then div by 4 at the ISel step
        SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
            DAG.getConstant(4 * i + ConstantBlock * 16, MVT::i32));
        Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
      }
      Result = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Slots, 4);
    } else {
      // A non-constant Ptr can't be folded; keep it as a v4i32 load.
      Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
          DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, MVT::i32)),
          DAG.getConstant(LoadNode->getAddressSpace() -
                          AMDGPUAS::CONSTANT_BUFFER_0, MVT::i32)
          );
    }

    if (!VT.isVector()) {
      Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
          DAG.getConstant(0, MVT::i32));
    }

    SDValue MergedValues[2] = {
      Result,
      Chain
    };
    return DAG.getMergeValues(MergedValues, 2, DL);
  }

  if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
    return SDValue();
  }

  // Lowering for indirect addressing
  const MachineFunction &MF = DAG.getMachineFunction();
  const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
                                         getTargetMachine().getFrameLowering());
  unsigned StackWidth = TFL->getStackWidth(MF);

  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);

  if (VT.isVector()) {
    unsigned NumElemVT = VT.getVectorNumElements();
    EVT ElemVT = VT.getVectorElementType();
    SDValue Loads[4];

    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
                                      "vector width in load");

    for (unsigned i = 0; i < NumElemVT; ++i) {
      unsigned Channel, PtrIncr;
      getStackAddress(StackWidth, i, Channel, PtrIncr);
      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
                        DAG.getConstant(PtrIncr, MVT::i32));
      Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
                             Chain, Ptr,
                             DAG.getTargetConstant(Channel, MVT::i32),
                             Op.getOperand(2));
    }
    for (unsigned i = NumElemVT; i < 4; ++i) {
      Loads[i] = DAG.getUNDEF(ElemVT);
    }
    EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4);
    LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads, 4);
  } else {
    LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
                              Chain, Ptr,
                              DAG.getTargetConstant(0, MVT::i32), // Channel
                              Op.getOperand(2));
  }

  SDValue Ops[2];
  Ops[0] = LoweredLoad;
  Ops[1] = Chain;

  return DAG.getMergeValues(Ops, 2, DL);
}
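
/// The first 36 bytes of the parameter space hold the nine implicit dword
/// parameters lowered above (ngroups, global and local sizes), so explicit
/// kernel arguments start at byte offset 36 (ParamOffsetBytes below).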
/// XXX Only kernel functions are supported, so we can assume for now that
/// every function is a kernel function, but in the future we should use
/// separate calling conventions for kernel and non-kernel functions.
SDValue R600TargetLowering::LowerFormalArguments(
                                      SDValue Chain,
                                      CallingConv::ID CallConv,
                                      bool isVarArg,
                                      const SmallVectorImpl<ISD::InputArg> &Ins,
                                      SDLoc DL, SelectionDAG &DAG,
                                      SmallVectorImpl<SDValue> &InVals) const {
  unsigned ParamOffsetBytes = 36;
  Function::const_arg_iterator FuncArg =
      DAG.getMachineFunction().getFunction()->arg_begin();
  for (unsigned i = 0, e = Ins.size(); i < e; ++i, ++FuncArg) {
    EVT VT = Ins[i].VT;
    Type *ArgType = FuncArg->getType();
    unsigned ArgSizeInBits = ArgType->isPointerTy() ?
                             32 : ArgType->getPrimitiveSizeInBits();
    unsigned ArgBytes = ArgSizeInBits >> 3;
    EVT ArgVT;
    if (ArgSizeInBits < VT.getSizeInBits()) {
      assert(!ArgType->isFloatTy() &&
             "Extending floating point arguments not supported yet");
      ArgVT = MVT::getIntegerVT(ArgSizeInBits);
    } else {
      ArgVT = VT;
    }
    PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
                                          AMDGPUAS::PARAM_I_ADDRESS);
    SDValue Arg = DAG.getExtLoad(ISD::ZEXTLOAD, DL, VT, DAG.getRoot(),
                                 DAG.getConstant(ParamOffsetBytes, MVT::i32),
                                 MachinePointerInfo(UndefValue::get(PtrTy)),
                                 ArgVT, false, false, ArgBytes);
    InVals.push_back(Arg);
    ParamOffsetBytes += ArgBytes;
  }
  return Chain;
}

EVT R600TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
  if (!VT.isVector()) return MVT::i32;
  return VT.changeVectorElementTypeToInteger();
}

SDValue CompactSwizzlableVector(SelectionDAG &DAG, SDValue VectorEntry,
                                DenseMap<unsigned, unsigned> &RemapSwizzle) {
  assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
  assert(RemapSwizzle.empty());
  SDValue NewBldVec[4] = {
    VectorEntry.getOperand(0),
    VectorEntry.getOperand(1),
    VectorEntry.getOperand(2),
    VectorEntry.getOperand(3)
  };

  for (unsigned i = 0; i < 4; i++) {
    if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(NewBldVec[i])) {
      if (C->isZero()) {
        RemapSwizzle[i] = 4; // SEL_0
        NewBldVec[i] = DAG.getUNDEF(MVT::f32);
      } else if (C->isExactlyValue(1.0)) {
        RemapSwizzle[i] = 5; // SEL_1
        NewBldVec[i] = DAG.getUNDEF(MVT::f32);
      }
    }

    if (NewBldVec[i].getOpcode() == ISD::UNDEF)
      continue;
    for (unsigned j = 0; j < i; j++) {
      if (NewBldVec[i] == NewBldVec[j]) {
        NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType());
        RemapSwizzle[i] = j;
        break;
      }
    }
  }

  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
      VectorEntry.getValueType(), NewBldVec, 4);
}

SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
                         DenseMap<unsigned, unsigned> &RemapSwizzle) {
  assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
  assert(RemapSwizzle.empty());
  SDValue NewBldVec[4] = {
    VectorEntry.getOperand(0),
    VectorEntry.getOperand(1),
    VectorEntry.getOperand(2),
    VectorEntry.getOperand(3)
  };
  bool isUnmovable[4] = { false, false, false, false };

  for (unsigned i = 0; i < 4; i++) {
    if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
      unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
          ->getZExtValue();
      if (!isUnmovable[Idx]) {
        // Swap i and Idx
        std::swap(NewBldVec[Idx], NewBldVec[i]);
        RemapSwizzle[Idx] = i;
        RemapSwizzle[i] = Idx;
      }
      isUnmovable[Idx] = true;
    }
  }

  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
      VectorEntry.getValueType(), NewBldVec, 4);
}
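
// OptimizeSwizzle runs two rewrites over a BUILD_VECTOR feeding an export
// or texture fetch: CompactSwizzlableVector folds 0.0/1.0 constants and
// duplicated elements into SEL_0/SEL_1/copy selectors, then
// ReorganizeVector moves extracted elements back into their source lanes.
// The Swz operands are remapped after each pass.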
SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector,
    SDValue Swz[4], SelectionDAG &DAG) const {
  assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR);
  // Old -> New swizzle values
  DenseMap<unsigned, unsigned> SwizzleRemap;

  BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap);
  for (unsigned i = 0; i < 4; i++) {
    unsigned Idx = dyn_cast<ConstantSDNode>(Swz[i])->getZExtValue();
    if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
      Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
  }

  SwizzleRemap.clear();
  BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap);
  for (unsigned i = 0; i < 4; i++) {
    unsigned Idx = dyn_cast<ConstantSDNode>(Swz[i])->getZExtValue();
    if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
      Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
  }

  return BuildVector;
}

//===----------------------------------------------------------------------===//
// Custom DAG Optimizations
//===----------------------------------------------------------------------===//

SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;

  switch (N->getOpcode()) {
  // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
  case ISD::FP_ROUND: {
    SDValue Arg = N->getOperand(0);
    if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
      return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), N->getValueType(0),
                         Arg.getOperand(0));
    }
    break;
  }

  // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
  // (i32 select_cc f32, f32, -1, 0 cc)
  //
  // Mesa's GLSL frontend generates the above pattern a lot and we can lower
  // this to one of the SET*_DX10 instructions.
  case ISD::FP_TO_SINT: {
    SDValue FNeg = N->getOperand(0);
    if (FNeg.getOpcode() != ISD::FNEG) {
      return SDValue();
    }
    SDValue SelectCC = FNeg.getOperand(0);
    if (SelectCC.getOpcode() != ISD::SELECT_CC ||
        SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
        SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
        !isHWTrueValue(SelectCC.getOperand(2)) ||
        !isHWFalseValue(SelectCC.getOperand(3))) {
      return SDValue();
    }

    return DAG.getNode(ISD::SELECT_CC, SDLoc(N), N->getValueType(0),
                       SelectCC.getOperand(0), // LHS
                       SelectCC.getOperand(1), // RHS
                       DAG.getConstant(-1, MVT::i32), // True
                       DAG.getConstant(0, MVT::i32), // False
                       SelectCC.getOperand(4)); // CC

    break;
  }
  // Extract_vec (Build_vector) generated by custom lowering
  // also needs to be custom combined
  case ISD::EXTRACT_VECTOR_ELT: {
    SDValue Arg = N->getOperand(0);
    if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
        unsigned Element = Const->getZExtValue();
        return Arg->getOperand(Element);
      }
    }
    if (Arg.getOpcode() == ISD::BITCAST &&
        Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
        unsigned Element = Const->getZExtValue();
        return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getVTList(),
            Arg->getOperand(0).getOperand(Element));
      }
    }
  }
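
  // Note: EXTRACT_VECTOR_ELT has no break and falls through to the
  // SELECT_CC combine below when neither rewrite above fires.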
  case ISD::SELECT_CC: {
    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
    // selectcc x, y, a, b, inv(cc)
    //
    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
    // selectcc x, y, a, b, cc
    SDValue LHS = N->getOperand(0);
    if (LHS.getOpcode() != ISD::SELECT_CC) {
      return SDValue();
    }

    SDValue RHS = N->getOperand(1);
    SDValue True = N->getOperand(2);
    SDValue False = N->getOperand(3);
    ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();

    if (LHS.getOperand(2).getNode() != True.getNode() ||
        LHS.getOperand(3).getNode() != False.getNode() ||
        RHS.getNode() != False.getNode()) {
      return SDValue();
    }

    switch (NCC) {
    default: return SDValue();
    case ISD::SETNE: return LHS;
    case ISD::SETEQ: {
      ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
      LHSCC = ISD::getSetCCInverse(LHSCC,
                                   LHS.getOperand(0).getValueType().isInteger());
      return DAG.getSelectCC(SDLoc(N),
                             LHS.getOperand(0),
                             LHS.getOperand(1),
                             LHS.getOperand(2),
                             LHS.getOperand(3),
                             LHSCC);
    }
    }
  }
  case AMDGPUISD::EXPORT: {
    SDValue Arg = N->getOperand(1);
    if (Arg.getOpcode() != ISD::BUILD_VECTOR)
      break;

    SDValue NewArgs[8] = {
      N->getOperand(0), // Chain
      SDValue(),
      N->getOperand(2), // ArrayBase
      N->getOperand(3), // Type
      N->getOperand(4), // SWZ_X
      N->getOperand(5), // SWZ_Y
      N->getOperand(6), // SWZ_Z
      N->getOperand(7)  // SWZ_W
    };
    SDLoc DL(N);
    NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG);
    return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs, 8);
  }
  case AMDGPUISD::TEXTURE_FETCH: {
    SDValue Arg = N->getOperand(1);
    if (Arg.getOpcode() != ISD::BUILD_VECTOR)
      break;

    SDValue NewArgs[19] = {
      N->getOperand(0),
      N->getOperand(1),
      N->getOperand(2),
      N->getOperand(3),
      N->getOperand(4),
      N->getOperand(5),
      N->getOperand(6),
      N->getOperand(7),
      N->getOperand(8),
      N->getOperand(9),
      N->getOperand(10),
      N->getOperand(11),
      N->getOperand(12),
      N->getOperand(13),
      N->getOperand(14),
      N->getOperand(15),
      N->getOperand(16),
      N->getOperand(17),
      N->getOperand(18),
    };
    NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG);
    return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, SDLoc(N), N->getVTList(),
        NewArgs, 19);
  }
  }
  return SDValue();
}