AMDGPUISelLowering.cpp revision 36b56886974eae4f9c5ebc96befd3e7bfe5de338
//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief This is the parent TargetLowering class for hardware code gen
/// targets.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUISelLowering.h"
#include "AMDGPU.h"
#include "AMDGPUFrameLowering.h"
#include "AMDGPURegisterInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDILIntrinsicInfo.h"
#include "R600MachineFunctionInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/IR/DataLayout.h"

using namespace llvm;

/// CCAssignFn referenced by the TableGen'd calling convention
/// (AMDGPUGenCallingConv.inc, included below). Allocates a stack slot sized
/// for \p ValVT's store size, aligned to the argument's original alignment,
/// and records the memory location in \p State. Always reports success.
static bool allocateStack(unsigned ValNo, MVT ValVT, MVT LocVT,
                          CCValAssign::LocInfo LocInfo,
                          ISD::ArgFlagsTy ArgFlags, CCState &State) {
  unsigned Offset = State.AllocateStack(ValVT.getStoreSize(),
                                        ArgFlags.getOrigAlign());
  State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));

  return true;
}

#include "AMDGPUGenCallingConv.inc"

/// Sets up the operation legality common to all AMDGPU subtargets; the R600
/// and SI subclasses refine this further in their own constructors.
AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
  TargetLowering(TM, new TargetLoweringObjectFileELF()) {

  Subtarget = &TM.getSubtarget<AMDGPUSubtarget>();

  // Initialize target lowering borrowed from AMDIL
  InitAMDILLowering();

  // We need to custom lower some of the intrinsics
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);

  // Library functions.  These default to Expand, but we have instructions
  // for them.
  setOperationAction(ISD::FCEIL, MVT::f32, Legal);
  setOperationAction(ISD::FEXP2, MVT::f32, Legal);
  setOperationAction(ISD::FPOW, MVT::f32, Legal);
  setOperationAction(ISD::FLOG2, MVT::f32, Legal);
  setOperationAction(ISD::FABS, MVT::f32, Legal);
  setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
  setOperationAction(ISD::FRINT, MVT::f32, Legal);
  setOperationAction(ISD::FROUND, MVT::f32, Legal);
  setOperationAction(ISD::FTRUNC, MVT::f32, Legal);

  // The hardware supports ROTR, but not ROTL
  setOperationAction(ISD::ROTL, MVT::i32, Expand);

  // Lower floating point store/load to integer store/load to reduce the number
  // of patterns in tablegen.
  setOperationAction(ISD::STORE, MVT::f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);

  setOperationAction(ISD::STORE, MVT::v2f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);

  setOperationAction(ISD::STORE, MVT::v4f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);

  setOperationAction(ISD::STORE, MVT::v8f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);

  setOperationAction(ISD::STORE, MVT::v16f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);

  setOperationAction(ISD::STORE, MVT::f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::f64, MVT::i64);

  // Custom lowering of vector stores is required for local address space
  // stores.
  setOperationAction(ISD::STORE, MVT::v4i32, Custom);
  // XXX: Native v2i32 local address space stores are possible, but not
  // currently implemented.
  setOperationAction(ISD::STORE, MVT::v2i32, Custom);

  setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
  setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
  setTruncStoreAction(MVT::v4i32, MVT::v4i8, Custom);

  // XXX: This can be change to Custom, once ExpandVectorStores can
  // handle 64-bit stores.
  setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);

  setTruncStoreAction(MVT::i64, MVT::i1, Expand);
  setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
  setTruncStoreAction(MVT::v4i64, MVT::v4i1, Expand);

  // Mirror the store promotions above for loads: fp loads become integer
  // loads of the same width.
  setOperationAction(ISD::LOAD, MVT::f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);

  setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);

  setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);

  setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);

  setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);

  setOperationAction(ISD::LOAD, MVT::f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::f64, MVT::i64);

  setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom);
  setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom);
  setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom);
  setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom);
  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Custom);
  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom);
  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f32, Custom);
  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i32, Custom);
  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom);
  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom);

  // No native extending loads for small-element vectors.
  setLoadExtAction(ISD::EXTLOAD, MVT::v2i8, Expand);
  setLoadExtAction(ISD::SEXTLOAD, MVT::v2i8, Expand);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i8, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4i8, Expand);
  setLoadExtAction(ISD::SEXTLOAD, MVT::v4i8, Expand);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i8, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2i16, Expand);
  setLoadExtAction(ISD::SEXTLOAD, MVT::v2i16, Expand);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, Expand);
  setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, Expand);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, Expand);

  setOperationAction(ISD::BR_CC, MVT::i1, Expand);

  setOperationAction(ISD::FNEG, MVT::v2f32, Expand);
  setOperationAction(ISD::FNEG, MVT::v4f32, Expand);

  setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);

  setOperationAction(ISD::MUL, MVT::i64, Expand);

  setOperationAction(ISD::UDIV, MVT::i32, Expand);
  setOperationAction(ISD::UDIVREM, MVT::i32, Custom);
  setOperationAction(ISD::UREM, MVT::i32, Expand);
  setOperationAction(ISD::VSELECT, MVT::v2f32, Expand);
  setOperationAction(ISD::VSELECT, MVT::v4f32, Expand);

  static const MVT::SimpleValueType IntTypes[] = {
    MVT::v2i32, MVT::v4i32
  };
  const size_t NumIntTypes = array_lengthof(IntTypes);

  for (unsigned int x = 0; x < NumIntTypes; ++x) {
    MVT::SimpleValueType VT = IntTypes[x];
    //Expand the following operations for the current type by default
    setOperationAction(ISD::ADD, VT, Expand);
    setOperationAction(ISD::AND, VT, Expand);
    setOperationAction(ISD::FP_TO_SINT, VT, Expand);
    setOperationAction(ISD::FP_TO_UINT, VT, Expand);
    setOperationAction(ISD::MUL, VT, Expand);
    setOperationAction(ISD::OR, VT, Expand);
    setOperationAction(ISD::SHL, VT, Expand);
    setOperationAction(ISD::SINT_TO_FP, VT, Expand);
    setOperationAction(ISD::SRL, VT, Expand);
    setOperationAction(ISD::SRA, VT, Expand);
    setOperationAction(ISD::SUB, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::UINT_TO_FP, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);
    setOperationAction(ISD::SELECT, VT, Expand);
    setOperationAction(ISD::VSELECT, VT, Expand);
    setOperationAction(ISD::XOR, VT, Expand);
  }

  static const MVT::SimpleValueType FloatTypes[] = {
    MVT::v2f32, MVT::v4f32
  };
  const size_t NumFloatTypes = array_lengthof(FloatTypes);

  // Vector fp ops are scalarized by default.
  for (unsigned int x = 0; x < NumFloatTypes; ++x) {
    MVT::SimpleValueType VT = FloatTypes[x];
    setOperationAction(ISD::FABS, VT, Expand);
    setOperationAction(ISD::FADD, VT, Expand);
    setOperationAction(ISD::FDIV, VT, Expand);
    setOperationAction(ISD::FPOW, VT, Expand);
    setOperationAction(ISD::FFLOOR, VT, Expand);
    setOperationAction(ISD::FTRUNC, VT, Expand);
    setOperationAction(ISD::FMUL, VT, Expand);
    setOperationAction(ISD::FRINT, VT, Expand);
    setOperationAction(ISD::FSQRT, VT, Expand);
    setOperationAction(ISD::FSUB, VT, Expand);
    setOperationAction(ISD::SELECT, VT, Expand);
  }

  // sign_extend_inreg from all the sub-register-sized types is custom lowered
  // (see LowerSIGN_EXTEND_INREG, dispatched from LowerOperation).
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom);

  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom);

  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom);

  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Custom);

  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom);
}

//===----------------------------------------------------------------------===//
// Target Information
//===----------------------------------------------------------------------===//

MVT AMDGPUTargetLowering::getVectorIdxTy() const {
  // Vector element indices are always i32 on this target.
  return MVT::i32;
}

bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy,
                                                   EVT CastTy) const {
  if (LoadTy.getSizeInBits() != CastTy.getSizeInBits())
    return true;

  unsigned LScalarSize = LoadTy.getScalarType().getSizeInBits();
  unsigned CastScalarSize = CastTy.getScalarType().getSizeInBits();

  // The only non-beneficial case is casting elements of >= 32 bits down to
  // sub-32-bit elements (all three clauses below are then false).
  return ((LScalarSize <= CastScalarSize) ||
          (CastScalarSize >= 32) ||
          (LScalarSize < 32));
}

//===---------------------------------------------------------------------===//
// Target Properties
//===---------------------------------------------------------------------===//

bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
  assert(VT.isFloatingPoint());
  // Only free for f32 on this target.
  return VT == MVT::f32;
}

bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
  assert(VT.isFloatingPoint());
  // Only free for f32 on this target.
  return VT == MVT::f32;
}

bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
  // Truncate is just accessing a subregister.
  return Dest.bitsLT(Source) && (Dest.getSizeInBits() % 32 == 0);
}

bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
  // Truncate is just accessing a subregister.
  return Dest->getPrimitiveSizeInBits() < Source->getPrimitiveSizeInBits() &&
         (Dest->getPrimitiveSizeInBits() % 32 == 0);
}

bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
  // Free exactly for scalar 32 -> 64 bit extension (see the EVT overload
  // below for the rationale).
  const DataLayout *DL = getDataLayout();
  unsigned SrcSize = DL->getTypeSizeInBits(Src->getScalarType());
  unsigned DestSize = DL->getTypeSizeInBits(Dest->getScalarType());

  return SrcSize == 32 && DestSize == 64;
}

bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
  // Any register load of a 64-bit value really requires 2 32-bit moves. For all
  // practical purposes, the extra mov 0 to load a 64-bit is free.  As used,
  // this will enable reducing 64-bit operations the 32-bit, which is always
  // good.
  return Src == MVT::i32 && Dest == MVT::i64;
}

bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
  // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
  // limited number of native 64-bit operations. Shrinking an operation to fit
  // in a single 32-bit register should always be helpful. As currently used,
  // this is much less general than the name suggests, and is only used in
  // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
  // not profitable, and may actually be harmful.
  return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
}

//===---------------------------------------------------------------------===//
// TargetLowering Callbacks
//===---------------------------------------------------------------------===//

/// Assigns locations to all formal arguments using the TableGen'd CC_AMDGPU
/// calling convention.
void AMDGPUTargetLowering::AnalyzeFormalArguments(CCState &State,
                             const SmallVectorImpl<ISD::InputArg> &Ins) const {

  State.AnalyzeFormalArguments(Ins, CC_AMDGPU);
}

/// Returns are lowered to a bare RET_FLAG node; no return values are passed
/// back through registers here (Outs/OutVals are ignored).
SDValue AMDGPUTargetLowering::LowerReturn(
                                     SDValue Chain,
                                     CallingConv::ID CallConv,
                                     bool isVarArg,
                                     const SmallVectorImpl<ISD::OutputArg> &Outs,
                                     const SmallVectorImpl<SDValue> &OutVals,
                                     SDLoc DL, SelectionDAG &DAG) const {
  return DAG.getNode(AMDGPUISD::RET_FLAG, DL, MVT::Other, Chain);
}

//===---------------------------------------------------------------------===//
// Target specific lowering
//===---------------------------------------------------------------------===//

/// Dispatch for all operations marked Custom in the constructor; anything
/// that reaches the default case is a legalization bug.
SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG)
    const {
  switch (Op.getOpcode()) {
  default:
    Op.getNode()->dump();
    llvm_unreachable("Custom lowering code for this"
                     "instruction is not implemented yet!");
    break;
  // AMDIL DAG lowering
  case ISD::SDIV: return LowerSDIV(Op, DAG);
  case ISD::SREM: return LowerSREM(Op, DAG);
  case
       ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
  case ISD::BRCOND: return LowerBRCOND(Op, DAG);
  // AMDGPU DAG lowering
  case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
  case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
  case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
  case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
  case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
  }
  return Op;
}

void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
                                              SmallVectorImpl<SDValue> &Results,
                                              SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  case ISD::SIGN_EXTEND_INREG:
    // Different parts of legalization seem to interpret which type of
    // sign_extend_inreg is the one to check for custom lowering. The extended
    // from type is what really matters, but some places check for custom
    // lowering of the result type. This results in trying to use
    // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
    // nothing here and let the illegal result integer be handled normally.
    return;

  default:
    return;
  }
}

/// Recursively emits the store(s) that write the constant \p Init to memory
/// at \p InitPtr: scalar int/fp constants become a single store, aggregates
/// recurse per element and the resulting chains are joined with a
/// TokenFactor. Any other initializer kind is a fatal error.
/// NOTE(review): the aggregate branch uses getArrayNumElements /
/// getArrayElementType — presumably only array aggregates reach here; confirm
/// struct initializers cannot.
SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init,
                                                       const GlobalValue *GV,
                                                       const SDValue &InitPtr,
                                                       SDValue Chain,
                                                       SelectionDAG &DAG) const {
  const DataLayout *TD = getTargetMachine().getDataLayout();
  SDLoc DL(InitPtr);
  if (const ConstantInt *CI = dyn_cast<ConstantInt>(Init)) {
    EVT VT = EVT::getEVT(CI->getType());
    PointerType *PtrTy = PointerType::get(CI->getType(), 0);
    return DAG.getStore(Chain, DL, DAG.getConstant(*CI, VT), InitPtr,
                 MachinePointerInfo(UndefValue::get(PtrTy)), false, false,
                 TD->getPrefTypeAlignment(CI->getType()));
  } else if (const ConstantFP *CFP = dyn_cast<ConstantFP>(Init)) {
    EVT VT = EVT::getEVT(CFP->getType());
    PointerType *PtrTy = PointerType::get(CFP->getType(), 0);
    return DAG.getStore(Chain, DL, DAG.getConstantFP(*CFP, VT), InitPtr,
                 MachinePointerInfo(UndefValue::get(PtrTy)), false, false,
                 TD->getPrefTypeAlignment(CFP->getType()));
  } else if (Init->getType()->isAggregateType()) {
    EVT PtrVT = InitPtr.getValueType();
    unsigned NumElements = Init->getType()->getArrayNumElements();
    SmallVector<SDValue, 8> Chains;
    for (unsigned i = 0; i < NumElements; ++i) {
      SDValue Offset = DAG.getConstant(i * TD->getTypeAllocSize(
          Init->getType()->getArrayElementType()), PtrVT);
      SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, InitPtr, Offset);
      Chains.push_back(LowerConstantInitializer(Init->getAggregateElement(i),
                       GV, Ptr, Chain, DAG));
    }
    return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, &Chains[0],
                       Chains.size());
  } else {
    Init->dump();
    llvm_unreachable("Unhandled constant initializer");
  }
}

/// Lowers a global address. LOCAL_ADDRESS globals are laid out in LDS by
/// bump-allocating offsets in the machine-function info; CONSTANT_ADDRESS
/// globals are materialized onto a stack object initialized from the global's
/// initializer.
SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
                                                 SDValue Op,
                                                 SelectionDAG &DAG) const {

  const DataLayout *TD = getTargetMachine().getDataLayout();
  GlobalAddressSDNode *G =
      cast<GlobalAddressSDNode>(Op);
  const GlobalValue *GV = G->getGlobal();

  switch (G->getAddressSpace()) {
  default: llvm_unreachable("Global Address lowering not implemented for this "
                            "address space");
  case AMDGPUAS::LOCAL_ADDRESS: {
    // XXX: What does the value of G->getOffset() mean?
    assert(G->getOffset() == 0 &&
           "Do not know what to do with an non-zero offset");

    // First use of this global allocates it at the current end of LDS;
    // later uses reuse the recorded offset.
    unsigned Offset;
    if (MFI->LocalMemoryObjects.count(GV) == 0) {
      uint64_t Size = TD->getTypeAllocSize(GV->getType()->getElementType());
      Offset = MFI->LDSSize;
      MFI->LocalMemoryObjects[GV] = Offset;
      // XXX: Account for alignment?
      MFI->LDSSize += Size;
    } else {
      Offset = MFI->LocalMemoryObjects[GV];
    }

    return DAG.getConstant(Offset, getPointerTy(G->getAddressSpace()));
  }
  case AMDGPUAS::CONSTANT_ADDRESS: {
    MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo();
    Type *EltType = GV->getType()->getElementType();
    unsigned Size = TD->getTypeAllocSize(EltType);
    unsigned Alignment = TD->getPrefTypeAlignment(EltType);

    // NOTE(review): Var is a dyn_cast result dereferenced without a null
    // check — presumably only GlobalVariables with initializers reach here;
    // confirm against callers.
    const GlobalVariable *Var = dyn_cast<GlobalVariable>(GV);
    const Constant *Init = Var->getInitializer();
    int FI = FrameInfo->CreateStackObject(Size, Alignment, false);
    SDValue InitPtr = DAG.getFrameIndex(FI,
        getPointerTy(AMDGPUAS::PRIVATE_ADDRESS));
    SmallVector<SDNode*, 8> WorkList;

    // Collect every load hanging off the entry node so it can be re-chained
    // after the initializer stores below.
    for (SDNode::use_iterator I = DAG.getEntryNode()->use_begin(),
                              E = DAG.getEntryNode()->use_end(); I != E; ++I) {
      if (I->getOpcode() != AMDGPUISD::REGISTER_LOAD && I->getOpcode() != ISD::LOAD)
        continue;
      WorkList.push_back(*I);
    }
    SDValue Chain = LowerConstantInitializer(Init, GV, InitPtr, DAG.getEntryNode(), DAG);
    // Make the collected loads depend on the initializer store chain so they
    // cannot be scheduled before the stack object is written.
    for (SmallVector<SDNode*, 8>::iterator I = WorkList.begin(),
                                           E = WorkList.end(); I != E; ++I) {
      SmallVector<SDValue, 8> Ops;
      Ops.push_back(Chain);
      for (unsigned i = 1; i < (*I)->getNumOperands(); ++i) {
        Ops.push_back((*I)->getOperand(i));
      }
      DAG.UpdateNodeOperands(*I, &Ops[0], Ops.size());
    }
    return DAG.getZExtOrTrunc(InitPtr, SDLoc(Op),
        getPointerTy(AMDGPUAS::CONSTANT_ADDRESS));
  }
  }
}

/// Appends EXTRACT_VECTOR_ELT nodes for elements [Start, Start + Count) of
/// \p Op to \p Args.
void AMDGPUTargetLowering::ExtractVectorElements(SDValue Op, SelectionDAG &DAG,
                                                 SmallVectorImpl<SDValue> &Args,
                                                 unsigned Start,
                                                 unsigned Count) const {
  EVT VT = Op.getValueType();
  for (unsigned i = Start, e = Start + Count; i != e; ++i) {
    Args.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Op),
                               VT.getVectorElementType(),
                               Op, DAG.getConstant(i, MVT::i32)));
  }
}

/// Lowers CONCAT_VECTORS by scalarizing both operands and rebuilding the
/// result with a single BUILD_VECTOR.
SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
                                                  SelectionDAG &DAG) const {
  SmallVector<SDValue, 8> Args;
  SDValue A = Op.getOperand(0);
  SDValue B = Op.getOperand(1);

  ExtractVectorElements(A, DAG, Args, 0,
                        A.getValueType().getVectorNumElements());
  ExtractVectorElements(B, DAG, Args, 0,
                        B.getValueType().getVectorNumElements());

  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), Op.getValueType(),
                     &Args[0], Args.size());
}

/// Lowers EXTRACT_SUBVECTOR by extracting the selected elements and
/// rebuilding them with BUILD_VECTOR.
SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
                                                     SelectionDAG &DAG) const {

  SmallVector<SDValue, 8> Args;
  EVT VT = Op.getValueType();
  unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
  ExtractVectorElements(Op.getOperand(0), DAG, Args, Start,
                        VT.getVectorNumElements());

  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), Op.getValueType(),
                     &Args[0], Args.size());
}

/// Lowers a frame index to a constant offset scaled by the target's stack
/// width (see the 4 * getStackWidth factor below).
SDValue AMDGPUTargetLowering::LowerFrameIndex(SDValue Op,
                                              SelectionDAG &DAG) const {

  MachineFunction &MF = DAG.getMachineFunction();
  const AMDGPUFrameLowering *TFL =
   static_cast<const AMDGPUFrameLowering*>(getTargetMachine().getFrameLowering());

  FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Op);
  assert(FIN);

  unsigned FrameIndex = FIN->getIndex();
  unsigned Offset =
      TFL->getFrameIndexOffset(MF, FrameIndex);
  return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF),
                         Op.getValueType());
}

/// Expands target intrinsics into their equivalent ISD / AMDGPUISD nodes;
/// unknown intrinsic IDs are returned unchanged for later selection.
SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
    SelectionDAG &DAG) const {
  unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  switch (IntrinsicID) {
    default: return Op;
    case AMDGPUIntrinsic::AMDIL_abs:
      return LowerIntrinsicIABS(Op, DAG);
    case AMDGPUIntrinsic::AMDIL_exp:
      return DAG.getNode(ISD::FEXP2, DL, VT, Op.getOperand(1));
    case AMDGPUIntrinsic::AMDGPU_lrp:
      return LowerIntrinsicLRP(Op, DAG);
    case AMDGPUIntrinsic::AMDIL_fraction:
      return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
    case AMDGPUIntrinsic::AMDIL_max:
      return DAG.getNode(AMDGPUISD::FMAX, DL, VT, Op.getOperand(1),
                         Op.getOperand(2));
    case AMDGPUIntrinsic::AMDGPU_imax:
      return DAG.getNode(AMDGPUISD::SMAX, DL, VT, Op.getOperand(1),
                         Op.getOperand(2));
    case AMDGPUIntrinsic::AMDGPU_umax:
      return DAG.getNode(AMDGPUISD::UMAX, DL, VT, Op.getOperand(1),
                         Op.getOperand(2));
    case AMDGPUIntrinsic::AMDIL_min:
      return DAG.getNode(AMDGPUISD::FMIN, DL, VT, Op.getOperand(1),
                         Op.getOperand(2));
    case AMDGPUIntrinsic::AMDGPU_imin:
      return DAG.getNode(AMDGPUISD::SMIN, DL, VT, Op.getOperand(1),
                         Op.getOperand(2));
    case AMDGPUIntrinsic::AMDGPU_umin:
      return DAG.getNode(AMDGPUISD::UMIN, DL, VT, Op.getOperand(1),
                         Op.getOperand(2));

    case AMDGPUIntrinsic::AMDGPU_bfe_i32:
      return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT,
                         Op.getOperand(1),
                         Op.getOperand(2),
                         Op.getOperand(3));

    case AMDGPUIntrinsic::AMDGPU_bfe_u32:
      return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT,
                         Op.getOperand(1),
                         Op.getOperand(2),
                         Op.getOperand(3));

    case AMDGPUIntrinsic::AMDGPU_bfi:
      return DAG.getNode(AMDGPUISD::BFI, DL, VT,
                         Op.getOperand(1),
                         Op.getOperand(2),
                         Op.getOperand(3));

    case AMDGPUIntrinsic::AMDGPU_bfm:
      return DAG.getNode(AMDGPUISD::BFM, DL, VT,
                         Op.getOperand(1),
                         Op.getOperand(2));

    case AMDGPUIntrinsic::AMDIL_round_nearest:
      return DAG.getNode(ISD::FRINT, DL, VT, Op.getOperand(1));
  }
}

///IABS(a) = SMAX(sub(0, a), a)
SDValue AMDGPUTargetLowering::LowerIntrinsicIABS(SDValue Op,
    SelectionDAG &DAG) const {

  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT),
                                              Op.getOperand(1));

  return DAG.getNode(AMDGPUISD::SMAX, DL, VT, Neg, Op.getOperand(1));
}

/// Linear Interpolation
/// LRP(a, b, c) = muladd(a, b, (1 - a) * c)
SDValue AMDGPUTargetLowering::LowerIntrinsicLRP(SDValue Op,
    SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  SDValue OneSubA = DAG.getNode(ISD::FSUB, DL, VT,
                                DAG.getConstantFP(1.0f, MVT::f32),
                                Op.getOperand(1));
  SDValue OneSubAC = DAG.getNode(ISD::FMUL, DL, VT, OneSubA,
                                 Op.getOperand(3));
  return DAG.getNode(ISD::FADD, DL, VT,
      DAG.getNode(ISD::FMUL, DL, VT, Op.getOperand(1), Op.getOperand(2)),
      OneSubAC);
}

/// \brief Generate Min/Max node
/// Matches an f32 select_cc whose operands form a min/max pattern
/// (LHS/RHS equal True/False in some order); returns an empty SDValue when
/// the pattern does not match.
SDValue AMDGPUTargetLowering::LowerMinMax(SDValue Op,
    SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDValue True = Op.getOperand(2);
  SDValue False = Op.getOperand(3);
  SDValue CC = Op.getOperand(4);

  if (VT != MVT::f32 ||
      !((LHS == True && RHS == False) || (LHS == False && RHS == True))) {
    return SDValue();
  }

  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
  switch (CCOpcode) {
  case ISD::SETOEQ:
  case ISD::SETONE:
  case ISD::SETUNE:
  case ISD::SETNE:
  case ISD::SETUEQ:
  case ISD::SETEQ:
  case ISD::SETFALSE:
  case ISD::SETFALSE2:
  case
       ISD::SETTRUE:
  case ISD::SETTRUE2:
  case ISD::SETUO:
  case ISD::SETO:
    llvm_unreachable("Operation should already be optimised!");
  // Less-than style compares: select_cc picking LHS on true is a min.
  case ISD::SETULE:
  case ISD::SETULT:
  case ISD::SETOLE:
  case ISD::SETOLT:
  case ISD::SETLE:
  case ISD::SETLT: {
    if (LHS == True)
      return DAG.getNode(AMDGPUISD::FMIN, DL, VT, LHS, RHS);
    else
      return DAG.getNode(AMDGPUISD::FMAX, DL, VT, LHS, RHS);
  }
  // Greater-than style compares: select_cc picking LHS on true is a max.
  case ISD::SETGT:
  case ISD::SETGE:
  case ISD::SETUGE:
  case ISD::SETOGE:
  case ISD::SETUGT:
  case ISD::SETOGT: {
    if (LHS == True)
      return DAG.getNode(AMDGPUISD::FMAX, DL, VT, LHS, RHS);
    else
      return DAG.getNode(AMDGPUISD::FMIN, DL, VT, LHS, RHS);
  }
  case ISD::SETCC_INVALID:
    llvm_unreachable("Invalid setcc condcode!");
  }
  return Op;
}

/// Splits a vector load into one scalar ext-load per element and rebuilds
/// the value with BUILD_VECTOR.
/// NOTE(review): Load comes from an unchecked dyn_cast — callers presumably
/// only pass LoadSDNodes; confirm.
SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue &Op,
                                              SelectionDAG &DAG) const {
  LoadSDNode *Load = dyn_cast<LoadSDNode>(Op);
  EVT MemEltVT = Load->getMemoryVT().getVectorElementType();
  EVT EltVT = Op.getValueType().getVectorElementType();
  EVT PtrVT = Load->getBasePtr().getValueType();
  unsigned NumElts = Load->getMemoryVT().getVectorNumElements();
  SmallVector<SDValue, 8> Loads;
  SDLoc SL(Op);

  for (unsigned i = 0, e = NumElts; i != e; ++i) {
    // Per-element address: base + i * (element size in bytes).
    SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, Load->getBasePtr(),
                  DAG.getConstant(i * (MemEltVT.getSizeInBits() / 8), PtrVT));
    Loads.push_back(DAG.getExtLoad(Load->getExtensionType(), SL, EltVT,
                    Load->getChain(), Ptr,
                    MachinePointerInfo(Load->getMemOperand()->getValue()),
                    MemEltVT, Load->isVolatile(), Load->isNonTemporal(),
                    Load->getAlignment()));
  }
  return DAG.getNode(ISD::BUILD_VECTOR, SL, Op.getValueType(),
                     Loads.data(), Loads.size());
}

/// Packs a small-element vector truncating store (total memory size <= 32
/// bits) into a single integer store by shifting/or-ing the masked elements
/// together. Returns an empty SDValue when the pattern does not apply.
SDValue AMDGPUTargetLowering::MergeVectorStore(const SDValue &Op,
                                               SelectionDAG &DAG) const {
  StoreSDNode *Store = dyn_cast<StoreSDNode>(Op);
  EVT MemVT = Store->getMemoryVT();
  unsigned MemBits = MemVT.getSizeInBits();

  // Byte stores are really expensive, so if possible, try to pack 32-bit vector
  // truncating store into an i32 store.
  // XXX: We could also handle optimize other vector bitwidths.
  if (!MemVT.isVector() || MemBits > 32) {
    return SDValue();
  }

  SDLoc DL(Op);
  const SDValue &Value = Store->getValue();
  EVT VT = Value.getValueType();
  const SDValue &Ptr = Store->getBasePtr();
  EVT MemEltVT = MemVT.getVectorElementType();
  unsigned MemEltBits = MemEltVT.getSizeInBits();
  unsigned MemNumElements = MemVT.getVectorNumElements();
  EVT PackedVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
  SDValue Mask = DAG.getConstant((1 << MemEltBits) - 1, PackedVT);

  // OR each masked, shifted element into the packed scalar.
  SDValue PackedValue;
  for (unsigned i = 0; i < MemNumElements; ++i) {
    EVT ElemVT = VT.getVectorElementType();
    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT, Value,
                              DAG.getConstant(i, MVT::i32));
    Elt = DAG.getZExtOrTrunc(Elt, DL, PackedVT);
    Elt = DAG.getNode(ISD::AND, DL, PackedVT, Elt, Mask);
    SDValue Shift = DAG.getConstant(MemEltBits * i, PackedVT);
    Elt = DAG.getNode(ISD::SHL, DL, PackedVT, Elt, Shift);
    if (i == 0) {
      PackedValue = Elt;
    } else {
      PackedValue = DAG.getNode(ISD::OR, DL, PackedVT, PackedValue, Elt);
    }
  }
  return DAG.getStore(Store->getChain(), DL, PackedValue, Ptr,
                      MachinePointerInfo(Store->getMemOperand()->getValue()),
                      Store->isVolatile(), Store->isNonTemporal(),
                      Store->getAlignment());
}

/// Splits a vector store into one scalar truncating store per element; the
/// per-element chains are joined with a TokenFactor.
SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
                                               SelectionDAG &DAG) const {
  StoreSDNode *Store = cast<StoreSDNode>(Op);
  EVT MemEltVT = Store->getMemoryVT().getVectorElementType();
  EVT EltVT = Store->getValue().getValueType().getVectorElementType();
  EVT PtrVT = Store->getBasePtr().getValueType();
  unsigned NumElts =
      Store->getMemoryVT().getVectorNumElements();
  SDLoc SL(Op);

  SmallVector<SDValue, 8> Chains;

  for (unsigned i = 0, e = NumElts; i != e; ++i) {
    SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
                              Store->getValue(), DAG.getConstant(i, MVT::i32));
    // Per-element address: base + i * (element size in bytes).
    SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT,
                              Store->getBasePtr(),
                            DAG.getConstant(i * (MemEltVT.getSizeInBits() / 8),
                                            PtrVT));
    Chains.push_back(DAG.getTruncStore(Store->getChain(), SL, Val, Ptr,
                         MachinePointerInfo(Store->getMemOperand()->getValue()),
                         MemEltVT, Store->isVolatile(), Store->isNonTemporal(),
                         Store->getAlignment()));
  }
  return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, &Chains[0], NumElts);
}

/// Custom load lowering: widens scalar ext-loads > 32 bits, turns constant
/// address space global loads into REGISTER_LOADs, and emulates sub-32-bit
/// private-address ext-loads with a 32-bit load plus shift/extend.
SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  LoadSDNode *Load = cast<LoadSDNode>(Op);
  ISD::LoadExtType ExtType = Load->getExtensionType();
  EVT VT = Op.getValueType();
  EVT MemVT = Load->getMemoryVT();

  if (ExtType != ISD::NON_EXTLOAD && !VT.isVector() && VT.getSizeInBits() > 32) {
    // We can do the extload to 32-bits, and then need to separately extend to
    // 64-bits.

    SDValue ExtLoad32 = DAG.getExtLoad(ExtType, DL, MVT::i32,
                                       Load->getChain(),
                                       Load->getBasePtr(),
                                       MemVT,
                                       Load->getMemOperand());
    return DAG.getNode(ISD::getExtForLoadExtType(ExtType), DL, VT, ExtLoad32);
  }

  // Lower loads constant address space global variable loads
  if (Load->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
      isa<GlobalVariable>(GetUnderlyingObject(Load->getPointerInfo().V))) {

    SDValue Ptr = DAG.getZExtOrTrunc(Load->getBasePtr(), DL,
        getPointerTy(AMDGPUAS::PRIVATE_ADDRESS));
    // Convert the byte address to a 32-bit register index (divide by 4).
    Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
        DAG.getConstant(2, MVT::i32));
    return DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(),
                       Load->getChain(), Ptr,
                       DAG.getTargetConstant(0, MVT::i32), Op.getOperand(2));
  }

  // Everything below only handles sub-32-bit ext-loads from private memory;
  // all other cases fall back to the default expansion.
  if (Load->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS ||
      ExtType == ISD::NON_EXTLOAD || Load->getMemoryVT().bitsGE(MVT::i32))
    return SDValue();


  // Load the containing 32-bit register word...
  SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Load->getBasePtr(),
                            DAG.getConstant(2, MVT::i32));
  SDValue Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(),
                            Load->getChain(), Ptr,
                            DAG.getTargetConstant(0, MVT::i32),
                            Op.getOperand(2));
  // ...then shift the addressed byte down to bit 0 (byte offset * 8)...
  SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32,
                                Load->getBasePtr(),
                                DAG.getConstant(0x3, MVT::i32));
  SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
                                 DAG.getConstant(3, MVT::i32));

  Ret = DAG.getNode(ISD::SRL, DL, MVT::i32, Ret, ShiftAmt);

  // ...and finish with the requested sign or zero extension.
  EVT MemEltVT = MemVT.getScalarType();
  if (ExtType == ISD::SEXTLOAD) {
    SDValue MemEltVTNode = DAG.getValueType(MemEltVT);
    return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode);
  }

  return DAG.getZeroExtendInReg(Ret, DL, MemEltVT);
}

/// Custom store lowering: first tries to pack small vector truncating stores
/// (MergeVectorStore), then splits local/private vector stores, then emulates
/// sub-32-bit private stores with a read-modify-write of the containing word.
SDValue AMDGPUTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue Result = AMDGPUTargetLowering::MergeVectorStore(Op,
DAG); 835 if (Result.getNode()) { 836 return Result; 837 } 838 839 StoreSDNode *Store = cast<StoreSDNode>(Op); 840 SDValue Chain = Store->getChain(); 841 if ((Store->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS || 842 Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) && 843 Store->getValue().getValueType().isVector()) { 844 return SplitVectorStore(Op, DAG); 845 } 846 847 EVT MemVT = Store->getMemoryVT(); 848 if (Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS && 849 MemVT.bitsLT(MVT::i32)) { 850 unsigned Mask = 0; 851 if (Store->getMemoryVT() == MVT::i8) { 852 Mask = 0xff; 853 } else if (Store->getMemoryVT() == MVT::i16) { 854 Mask = 0xffff; 855 } 856 SDValue BasePtr = Store->getBasePtr(); 857 SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, BasePtr, 858 DAG.getConstant(2, MVT::i32)); 859 SDValue Dst = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32, 860 Chain, Ptr, DAG.getTargetConstant(0, MVT::i32)); 861 862 SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, BasePtr, 863 DAG.getConstant(0x3, MVT::i32)); 864 865 SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx, 866 DAG.getConstant(3, MVT::i32)); 867 868 SDValue SExtValue = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, 869 Store->getValue()); 870 871 SDValue MaskedValue = DAG.getZeroExtendInReg(SExtValue, DL, MemVT); 872 873 SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32, 874 MaskedValue, ShiftAmt); 875 876 SDValue DstMask = DAG.getNode(ISD::SHL, DL, MVT::i32, DAG.getConstant(Mask, MVT::i32), 877 ShiftAmt); 878 DstMask = DAG.getNode(ISD::XOR, DL, MVT::i32, DstMask, 879 DAG.getConstant(0xffffffff, MVT::i32)); 880 Dst = DAG.getNode(ISD::AND, DL, MVT::i32, Dst, DstMask); 881 882 SDValue Value = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue); 883 return DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, 884 Chain, Value, Ptr, DAG.getTargetConstant(0, MVT::i32)); 885 } 886 return SDValue(); 887} 888 889SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op, 890 
SelectionDAG &DAG) const { 891 SDLoc DL(Op); 892 EVT VT = Op.getValueType(); 893 894 SDValue Num = Op.getOperand(0); 895 SDValue Den = Op.getOperand(1); 896 897 SmallVector<SDValue, 8> Results; 898 899 // RCP = URECIP(Den) = 2^32 / Den + e 900 // e is rounding error. 901 SDValue RCP = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Den); 902 903 // RCP_LO = umulo(RCP, Den) */ 904 SDValue RCP_LO = DAG.getNode(ISD::UMULO, DL, VT, RCP, Den); 905 906 // RCP_HI = mulhu (RCP, Den) */ 907 SDValue RCP_HI = DAG.getNode(ISD::MULHU, DL, VT, RCP, Den); 908 909 // NEG_RCP_LO = -RCP_LO 910 SDValue NEG_RCP_LO = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT), 911 RCP_LO); 912 913 // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO) 914 SDValue ABS_RCP_LO = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, VT), 915 NEG_RCP_LO, RCP_LO, 916 ISD::SETEQ); 917 // Calculate the rounding error from the URECIP instruction 918 // E = mulhu(ABS_RCP_LO, RCP) 919 SDValue E = DAG.getNode(ISD::MULHU, DL, VT, ABS_RCP_LO, RCP); 920 921 // RCP_A_E = RCP + E 922 SDValue RCP_A_E = DAG.getNode(ISD::ADD, DL, VT, RCP, E); 923 924 // RCP_S_E = RCP - E 925 SDValue RCP_S_E = DAG.getNode(ISD::SUB, DL, VT, RCP, E); 926 927 // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_SUB_E) 928 SDValue Tmp0 = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, VT), 929 RCP_A_E, RCP_S_E, 930 ISD::SETEQ); 931 // Quotient = mulhu(Tmp0, Num) 932 SDValue Quotient = DAG.getNode(ISD::MULHU, DL, VT, Tmp0, Num); 933 934 // Num_S_Remainder = Quotient * Den 935 SDValue Num_S_Remainder = DAG.getNode(ISD::UMULO, DL, VT, Quotient, Den); 936 937 // Remainder = Num - Num_S_Remainder 938 SDValue Remainder = DAG.getNode(ISD::SUB, DL, VT, Num, Num_S_Remainder); 939 940 // Remainder_GE_Den = (Remainder >= Den ? -1 : 0) 941 SDValue Remainder_GE_Den = DAG.getSelectCC(DL, Remainder, Den, 942 DAG.getConstant(-1, VT), 943 DAG.getConstant(0, VT), 944 ISD::SETUGE); 945 // Remainder_GE_Zero = (Num >= Num_S_Remainder ? 
-1 : 0) 946 SDValue Remainder_GE_Zero = DAG.getSelectCC(DL, Num, 947 Num_S_Remainder, 948 DAG.getConstant(-1, VT), 949 DAG.getConstant(0, VT), 950 ISD::SETUGE); 951 // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero 952 SDValue Tmp1 = DAG.getNode(ISD::AND, DL, VT, Remainder_GE_Den, 953 Remainder_GE_Zero); 954 955 // Calculate Division result: 956 957 // Quotient_A_One = Quotient + 1 958 SDValue Quotient_A_One = DAG.getNode(ISD::ADD, DL, VT, Quotient, 959 DAG.getConstant(1, VT)); 960 961 // Quotient_S_One = Quotient - 1 962 SDValue Quotient_S_One = DAG.getNode(ISD::SUB, DL, VT, Quotient, 963 DAG.getConstant(1, VT)); 964 965 // Div = (Tmp1 == 0 ? Quotient : Quotient_A_One) 966 SDValue Div = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, VT), 967 Quotient, Quotient_A_One, ISD::SETEQ); 968 969 // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div) 970 Div = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, VT), 971 Quotient_S_One, Div, ISD::SETEQ); 972 973 // Calculate Rem result: 974 975 // Remainder_S_Den = Remainder - Den 976 SDValue Remainder_S_Den = DAG.getNode(ISD::SUB, DL, VT, Remainder, Den); 977 978 // Remainder_A_Den = Remainder + Den 979 SDValue Remainder_A_Den = DAG.getNode(ISD::ADD, DL, VT, Remainder, Den); 980 981 // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den) 982 SDValue Rem = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, VT), 983 Remainder, Remainder_S_Den, ISD::SETEQ); 984 985 // Rem = (Remainder_GE_Zero == 0 ? 
Remainder_A_Den : Rem) 986 Rem = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, VT), 987 Remainder_A_Den, Rem, ISD::SETEQ); 988 SDValue Ops[2]; 989 Ops[0] = Div; 990 Ops[1] = Rem; 991 return DAG.getMergeValues(Ops, 2, DL); 992} 993 994SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op, 995 SelectionDAG &DAG) const { 996 SDValue S0 = Op.getOperand(0); 997 SDLoc DL(Op); 998 if (Op.getValueType() != MVT::f32 || S0.getValueType() != MVT::i64) 999 return SDValue(); 1000 1001 // f32 uint_to_fp i64 1002 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, S0, 1003 DAG.getConstant(0, MVT::i32)); 1004 SDValue FloatLo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, Lo); 1005 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, S0, 1006 DAG.getConstant(1, MVT::i32)); 1007 SDValue FloatHi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, Hi); 1008 FloatHi = DAG.getNode(ISD::FMUL, DL, MVT::f32, FloatHi, 1009 DAG.getConstantFP(4294967296.0f, MVT::f32)); // 2^32 1010 return DAG.getNode(ISD::FADD, DL, MVT::f32, FloatLo, FloatHi); 1011 1012} 1013 1014SDValue AMDGPUTargetLowering::ExpandSIGN_EXTEND_INREG(SDValue Op, 1015 unsigned BitsDiff, 1016 SelectionDAG &DAG) const { 1017 MVT VT = Op.getSimpleValueType(); 1018 SDLoc DL(Op); 1019 SDValue Shift = DAG.getConstant(BitsDiff, VT); 1020 // Shift left by 'Shift' bits. 1021 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, Op.getOperand(0), Shift); 1022 // Signed shift Right by 'Shift' bits. 
1023 return DAG.getNode(ISD::SRA, DL, VT, Shl, Shift); 1024} 1025 1026SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, 1027 SelectionDAG &DAG) const { 1028 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT(); 1029 MVT VT = Op.getSimpleValueType(); 1030 MVT ScalarVT = VT.getScalarType(); 1031 1032 unsigned SrcBits = ExtraVT.getScalarType().getSizeInBits(); 1033 unsigned DestBits = ScalarVT.getSizeInBits(); 1034 unsigned BitsDiff = DestBits - SrcBits; 1035 1036 if (!Subtarget->hasBFE()) 1037 return ExpandSIGN_EXTEND_INREG(Op, BitsDiff, DAG); 1038 1039 SDValue Src = Op.getOperand(0); 1040 if (VT.isVector()) { 1041 SDLoc DL(Op); 1042 // Need to scalarize this, and revisit each of the scalars later. 1043 // TODO: Don't scalarize on Evergreen? 1044 unsigned NElts = VT.getVectorNumElements(); 1045 SmallVector<SDValue, 8> Args; 1046 ExtractVectorElements(Src, DAG, Args, 0, NElts); 1047 1048 SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType()); 1049 for (unsigned I = 0; I < NElts; ++I) 1050 Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp); 1051 1052 return DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Args.data(), Args.size()); 1053 } 1054 1055 if (SrcBits == 32) { 1056 SDLoc DL(Op); 1057 1058 // If the source is 32-bits, this is really half of a 2-register pair, and 1059 // we need to discard the unused half of the pair. 1060 SDValue TruncSrc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Src); 1061 return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, TruncSrc); 1062 } 1063 1064 unsigned NElts = VT.isVector() ? VT.getVectorNumElements() : 1; 1065 1066 // TODO: Match 64-bit BFE. SI has a 64-bit BFE, but it's scalar only so it 1067 // might not be worth the effort, and will need to expand to shifts when 1068 // fixing SGPR copies. 1069 if (SrcBits < 32 && DestBits <= 32) { 1070 SDLoc DL(Op); 1071 MVT ExtVT = (NElts == 1) ? 
MVT::i32 : MVT::getVectorVT(MVT::i32, NElts); 1072 1073 if (DestBits != 32) 1074 Src = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtVT, Src); 1075 1076 // FIXME: This should use TargetConstant, but that hits assertions for 1077 // Evergreen. 1078 SDValue Ext = DAG.getNode(AMDGPUISD::BFE_I32, DL, ExtVT, 1079 Op.getOperand(0), // Operand 1080 DAG.getConstant(0, ExtVT), // Offset 1081 DAG.getConstant(SrcBits, ExtVT)); // Width 1082 1083 // Truncate to the original type if necessary. 1084 if (ScalarVT == MVT::i32) 1085 return Ext; 1086 return DAG.getNode(ISD::TRUNCATE, DL, VT, Ext); 1087 } 1088 1089 // For small types, extend to 32-bits first. 1090 if (SrcBits < 32) { 1091 SDLoc DL(Op); 1092 MVT ExtVT = (NElts == 1) ? MVT::i32 : MVT::getVectorVT(MVT::i32, NElts); 1093 1094 SDValue TruncSrc = DAG.getNode(ISD::TRUNCATE, DL, ExtVT, Src); 1095 SDValue Ext32 = DAG.getNode(AMDGPUISD::BFE_I32, 1096 DL, 1097 ExtVT, 1098 TruncSrc, // Operand 1099 DAG.getConstant(0, ExtVT), // Offset 1100 DAG.getConstant(SrcBits, ExtVT)); // Width 1101 1102 return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Ext32); 1103 } 1104 1105 // For everything else, use the standard bitshift expansion. 1106 return ExpandSIGN_EXTEND_INREG(Op, BitsDiff, DAG); 1107} 1108 1109//===----------------------------------------------------------------------===// 1110// Helper functions 1111//===----------------------------------------------------------------------===// 1112 1113void AMDGPUTargetLowering::getOriginalFunctionArgs( 1114 SelectionDAG &DAG, 1115 const Function *F, 1116 const SmallVectorImpl<ISD::InputArg> &Ins, 1117 SmallVectorImpl<ISD::InputArg> &OrigIns) const { 1118 1119 for (unsigned i = 0, e = Ins.size(); i < e; ++i) { 1120 if (Ins[i].ArgVT == Ins[i].VT) { 1121 OrigIns.push_back(Ins[i]); 1122 continue; 1123 } 1124 1125 EVT VT; 1126 if (Ins[i].ArgVT.isVector() && !Ins[i].VT.isVector()) { 1127 // Vector has been split into scalars. 
1128 VT = Ins[i].ArgVT.getVectorElementType(); 1129 } else if (Ins[i].VT.isVector() && Ins[i].ArgVT.isVector() && 1130 Ins[i].ArgVT.getVectorElementType() != 1131 Ins[i].VT.getVectorElementType()) { 1132 // Vector elements have been promoted 1133 VT = Ins[i].ArgVT; 1134 } else { 1135 // Vector has been spilt into smaller vectors. 1136 VT = Ins[i].VT; 1137 } 1138 1139 ISD::InputArg Arg(Ins[i].Flags, VT, VT, Ins[i].Used, 1140 Ins[i].OrigArgIndex, Ins[i].PartOffset); 1141 OrigIns.push_back(Arg); 1142 } 1143} 1144 1145bool AMDGPUTargetLowering::isHWTrueValue(SDValue Op) const { 1146 if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) { 1147 return CFP->isExactlyValue(1.0); 1148 } 1149 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 1150 return C->isAllOnesValue(); 1151 } 1152 return false; 1153} 1154 1155bool AMDGPUTargetLowering::isHWFalseValue(SDValue Op) const { 1156 if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) { 1157 return CFP->getValueAPF().isZero(); 1158 } 1159 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 1160 return C->isNullValue(); 1161 } 1162 return false; 1163} 1164 1165SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG, 1166 const TargetRegisterClass *RC, 1167 unsigned Reg, EVT VT) const { 1168 MachineFunction &MF = DAG.getMachineFunction(); 1169 MachineRegisterInfo &MRI = MF.getRegInfo(); 1170 unsigned VirtualRegister; 1171 if (!MRI.isLiveIn(Reg)) { 1172 VirtualRegister = MRI.createVirtualRegister(RC); 1173 MRI.addLiveIn(Reg, VirtualRegister); 1174 } else { 1175 VirtualRegister = MRI.getLiveInVirtReg(Reg); 1176 } 1177 return DAG.getRegister(VirtualRegister, VT); 1178} 1179 1180#define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node; 1181 1182const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { 1183 switch (Opcode) { 1184 default: return 0; 1185 // AMDIL DAG nodes 1186 NODE_NAME_CASE(CALL); 1187 NODE_NAME_CASE(UMUL); 1188 NODE_NAME_CASE(DIV_INF); 1189 
NODE_NAME_CASE(RET_FLAG); 1190 NODE_NAME_CASE(BRANCH_COND); 1191 1192 // AMDGPU DAG nodes 1193 NODE_NAME_CASE(DWORDADDR) 1194 NODE_NAME_CASE(FRACT) 1195 NODE_NAME_CASE(FMAX) 1196 NODE_NAME_CASE(SMAX) 1197 NODE_NAME_CASE(UMAX) 1198 NODE_NAME_CASE(FMIN) 1199 NODE_NAME_CASE(SMIN) 1200 NODE_NAME_CASE(UMIN) 1201 NODE_NAME_CASE(BFE_U32) 1202 NODE_NAME_CASE(BFE_I32) 1203 NODE_NAME_CASE(BFI) 1204 NODE_NAME_CASE(BFM) 1205 NODE_NAME_CASE(URECIP) 1206 NODE_NAME_CASE(DOT4) 1207 NODE_NAME_CASE(EXPORT) 1208 NODE_NAME_CASE(CONST_ADDRESS) 1209 NODE_NAME_CASE(REGISTER_LOAD) 1210 NODE_NAME_CASE(REGISTER_STORE) 1211 NODE_NAME_CASE(LOAD_CONSTANT) 1212 NODE_NAME_CASE(LOAD_INPUT) 1213 NODE_NAME_CASE(SAMPLE) 1214 NODE_NAME_CASE(SAMPLEB) 1215 NODE_NAME_CASE(SAMPLED) 1216 NODE_NAME_CASE(SAMPLEL) 1217 NODE_NAME_CASE(STORE_MSKOR) 1218 NODE_NAME_CASE(TBUFFER_STORE_FORMAT) 1219 } 1220} 1221 1222static void computeMaskedBitsForMinMax(const SDValue Op0, 1223 const SDValue Op1, 1224 APInt &KnownZero, 1225 APInt &KnownOne, 1226 const SelectionDAG &DAG, 1227 unsigned Depth) { 1228 APInt Op0Zero, Op0One; 1229 APInt Op1Zero, Op1One; 1230 DAG.ComputeMaskedBits(Op0, Op0Zero, Op0One, Depth); 1231 DAG.ComputeMaskedBits(Op1, Op1Zero, Op1One, Depth); 1232 1233 KnownZero = Op0Zero & Op1Zero; 1234 KnownOne = Op0One & Op1One; 1235} 1236 1237void AMDGPUTargetLowering::computeMaskedBitsForTargetNode( 1238 const SDValue Op, 1239 APInt &KnownZero, 1240 APInt &KnownOne, 1241 const SelectionDAG &DAG, 1242 unsigned Depth) const { 1243 1244 KnownZero = KnownOne = APInt(KnownOne.getBitWidth(), 0); // Don't know anything. 1245 unsigned Opc = Op.getOpcode(); 1246 switch (Opc) { 1247 case ISD::INTRINSIC_WO_CHAIN: { 1248 // FIXME: The intrinsic should just use the node. 
1249 switch (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue()) { 1250 case AMDGPUIntrinsic::AMDGPU_imax: 1251 case AMDGPUIntrinsic::AMDGPU_umax: 1252 case AMDGPUIntrinsic::AMDGPU_imin: 1253 case AMDGPUIntrinsic::AMDGPU_umin: 1254 computeMaskedBitsForMinMax(Op.getOperand(1), Op.getOperand(2), 1255 KnownZero, KnownOne, DAG, Depth); 1256 break; 1257 default: 1258 break; 1259 } 1260 1261 break; 1262 } 1263 case AMDGPUISD::SMAX: 1264 case AMDGPUISD::UMAX: 1265 case AMDGPUISD::SMIN: 1266 case AMDGPUISD::UMIN: 1267 computeMaskedBitsForMinMax(Op.getOperand(0), Op.getOperand(1), 1268 KnownZero, KnownOne, DAG, Depth); 1269 break; 1270 default: 1271 break; 1272 } 1273} 1274