AMDGPUISelLowering.cpp revision dce4a407a24b04eebc6a376f8e62b41aaa7b071f
//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief This is the parent TargetLowering class for hardware code gen
/// targets.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUISelLowering.h"
#include "AMDGPU.h"
#include "AMDGPUFrameLowering.h"
#include "AMDGPURegisterInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDILIntrinsicInfo.h"
#include "R600MachineFunctionInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/DiagnosticPrinter.h"

using namespace llvm;

namespace {

/// Diagnostic information for unimplemented or unsupported feature reporting.
///
/// NOTE(review): this class stores a *reference* to the Twine description, so
/// an instance must not outlive the full-expression that created the Twine it
/// was constructed from — verify every caller keeps the Twine alive until
/// after diagnose()/print() runs.
class DiagnosticInfoUnsupported : public DiagnosticInfo {
private:
  const Twine &Description; // Not owned; see lifetime note above.
  const Function &Fn;       // Function the unsupported construct appears in.

  // Plugin-style diagnostic kind, allocated lazily on first use.
  static int KindID;

  static int getKindID() {
    if (KindID == 0)
      KindID = llvm::getNextAvailablePluginDiagnosticKind();
    return KindID;
  }

public:
  DiagnosticInfoUnsupported(const Function &Fn, const Twine &Desc,
                            DiagnosticSeverity Severity = DS_Error)
    : DiagnosticInfo(getKindID(), Severity),
      Description(Desc),
      Fn(Fn) { }

  const Function &getFunction() const { return Fn; }
  const Twine &getDescription() const { return Description; }

  void print(DiagnosticPrinter &DP) const override {
    DP << "unsupported " << getDescription() << " in " << Fn.getName();
  }

  static bool classof(const DiagnosticInfo *DI) {
    return DI->getKind() == getKindID();
  }
};

int DiagnosticInfoUnsupported::KindID = 0;
}


/// CC lowering helper used by the generated calling convention: place the
/// argument on the stack at its natural alignment and record the location.
static bool allocateStack(unsigned ValNo, MVT ValVT, MVT LocVT,
                          CCValAssign::LocInfo LocInfo,
                          ISD::ArgFlagsTy ArgFlags, CCState &State) {
  unsigned Offset = State.AllocateStack(ValVT.getStoreSize(),
                                        ArgFlags.getOrigAlign());
  State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));

  return true;
}

#include "AMDGPUGenCallingConv.inc"

AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
  TargetLowering(TM, new TargetLoweringObjectFileELF()) {

  Subtarget = &TM.getSubtarget<AMDGPUSubtarget>();

  // Initialize target lowering borrowed from AMDIL
  InitAMDILLowering();

  // We need to custom lower some of the intrinsics
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);

  // Library functions. These default to Expand, but we have instructions
  // for them.
  setOperationAction(ISD::FCEIL,  MVT::f32, Legal);
  setOperationAction(ISD::FEXP2,  MVT::f32, Legal);
  setOperationAction(ISD::FPOW,   MVT::f32, Legal);
  setOperationAction(ISD::FLOG2,  MVT::f32, Legal);
  setOperationAction(ISD::FABS,   MVT::f32, Legal);
  setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
  setOperationAction(ISD::FRINT,  MVT::f32, Legal);
  setOperationAction(ISD::FROUND, MVT::f32, Legal);
  setOperationAction(ISD::FTRUNC, MVT::f32, Legal);

  // The hardware supports ROTR, but not ROTL
  setOperationAction(ISD::ROTL, MVT::i32, Expand);

  // Lower floating point store/load to integer store/load to reduce the number
  // of patterns in tablegen.
  setOperationAction(ISD::STORE, MVT::f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);

  setOperationAction(ISD::STORE, MVT::v2f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);

  setOperationAction(ISD::STORE, MVT::v4f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);

  setOperationAction(ISD::STORE, MVT::v8f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);

  setOperationAction(ISD::STORE, MVT::v16f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);

  setOperationAction(ISD::STORE, MVT::f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::f64, MVT::i64);

  setOperationAction(ISD::STORE, MVT::v2f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v2i64);

  // Custom lowering of vector stores is required for local address space
  // stores.
  setOperationAction(ISD::STORE, MVT::v4i32, Custom);
  // XXX: Native v2i32 local address space stores are possible, but not
  // currently implemented.
  setOperationAction(ISD::STORE, MVT::v2i32, Custom);

  setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
  setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
  setTruncStoreAction(MVT::v4i32, MVT::v4i8, Custom);

  // XXX: This can be changed to Custom, once ExpandVectorStores can
  // handle 64-bit stores.
  setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);

  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  setTruncStoreAction(MVT::i64, MVT::i8, Expand);
  setTruncStoreAction(MVT::i64, MVT::i1, Expand);
  setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
  setTruncStoreAction(MVT::v4i64, MVT::v4i1, Expand);

  // Mirror the store promotions for loads: FP loads are performed as the
  // same-width integer loads.
  setOperationAction(ISD::LOAD, MVT::f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);

  setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);

  setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);

  setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);

  setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);

  setOperationAction(ISD::LOAD, MVT::f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::f64, MVT::i64);

  setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v2i64);

  setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom);
  setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom);
  setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom);
  setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom);
  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Custom);
  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom);
  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f32, Custom);
  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i32, Custom);
  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom);
  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom);

  setLoadExtAction(ISD::EXTLOAD, MVT::v2i8, Expand);
  setLoadExtAction(ISD::SEXTLOAD, MVT::v2i8, Expand);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i8, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4i8, Expand);
  setLoadExtAction(ISD::SEXTLOAD, MVT::v4i8, Expand);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i8, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2i16, Expand);
  setLoadExtAction(ISD::SEXTLOAD, MVT::v2i16, Expand);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, Expand);
  setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, Expand);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, Expand);

  setOperationAction(ISD::BR_CC, MVT::i1, Expand);

  setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);

  setOperationAction(ISD::FNEG, MVT::v2f32, Expand);
  setOperationAction(ISD::FNEG, MVT::v4f32, Expand);

  setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);

  setOperationAction(ISD::MUL, MVT::i64, Expand);
  setOperationAction(ISD::SUB, MVT::i64, Expand);

  // i32/i64 UDIV/UREM are expanded; UDIVREM is custom-lowered (see
  // LowerUDIVREM and ReplaceNodeResults), so the expansions funnel there.
  setOperationAction(ISD::UDIV, MVT::i32, Expand);
  setOperationAction(ISD::UDIVREM, MVT::i32, Custom);
  setOperationAction(ISD::UDIVREM, MVT::i64, Custom);
  setOperationAction(ISD::UREM, MVT::i32, Expand);
  setOperationAction(ISD::VSELECT, MVT::v2f32, Expand);
  setOperationAction(ISD::VSELECT, MVT::v4f32, Expand);

  static const MVT::SimpleValueType IntTypes[] = {
    MVT::v2i32, MVT::v4i32
  };

  for (MVT VT : IntTypes) {
    // Expand the following operations for the current type by default
    setOperationAction(ISD::ADD,  VT, Expand);
    setOperationAction(ISD::AND,  VT, Expand);
    setOperationAction(ISD::FP_TO_SINT, VT, Expand);
    setOperationAction(ISD::FP_TO_UINT, VT, Expand);
    setOperationAction(ISD::MUL,  VT, Expand);
    setOperationAction(ISD::OR,   VT, Expand);
    setOperationAction(ISD::SHL,  VT, Expand);
    setOperationAction(ISD::SINT_TO_FP, VT, Expand);
    setOperationAction(ISD::SRL,  VT, Expand);
    setOperationAction(ISD::SRA,  VT, Expand);
    setOperationAction(ISD::SUB,  VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::UINT_TO_FP, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);
    setOperationAction(ISD::SELECT, VT, Expand);
    setOperationAction(ISD::VSELECT, VT, Expand);
    setOperationAction(ISD::XOR,  VT, Expand);
  }

  static const MVT::SimpleValueType FloatTypes[] = {
    MVT::v2f32, MVT::v4f32
  };

  // Vector FP ops are scalarized/expanded by default as well.
  for (MVT VT : FloatTypes) {
    setOperationAction(ISD::FABS, VT, Expand);
    setOperationAction(ISD::FADD, VT, Expand);
    setOperationAction(ISD::FCOS, VT, Expand);
    setOperationAction(ISD::FDIV, VT, Expand);
    setOperationAction(ISD::FPOW, VT, Expand);
    setOperationAction(ISD::FFLOOR, VT, Expand);
    setOperationAction(ISD::FTRUNC, VT, Expand);
    setOperationAction(ISD::FMUL, VT, Expand);
    setOperationAction(ISD::FRINT, VT, Expand);
    setOperationAction(ISD::FSQRT, VT, Expand);
    setOperationAction(ISD::FSIN, VT, Expand);
    setOperationAction(ISD::FSUB, VT, Expand);
    setOperationAction(ISD::SELECT, VT, Expand);
  }

  setTargetDAGCombine(ISD::MUL);
  setTargetDAGCombine(ISD::SELECT_CC);
}

//===----------------------------------------------------------------------===//
// Target Information
//===----------------------------------------------------------------------===//

MVT AMDGPUTargetLowering::getVectorIdxTy() const {
  return MVT::i32;
}

bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy,
                                                   EVT CastTy) const {
  if (LoadTy.getSizeInBits() != CastTy.getSizeInBits())
    return true;

  unsigned LScalarSize = LoadTy.getScalarType().getSizeInBits();
  unsigned CastScalarSize = CastTy.getScalarType().getSizeInBits();

  return ((LScalarSize <= CastScalarSize) ||
          (CastScalarSize >= 32) ||
          (LScalarSize < 32));
}

//===---------------------------------------------------------------------===//
// Target Properties
//===---------------------------------------------------------------------===//

bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
  assert(VT.isFloatingPoint());
  return VT == MVT::f32;
}

bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
  assert(VT.isFloatingPoint());
  return VT == MVT::f32;
}

bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
  // Truncate is just accessing a subregister.
  return Dest.bitsLT(Source) && (Dest.getSizeInBits() % 32 == 0);
}

bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
  // Truncate is just accessing a subregister.
  return Dest->getPrimitiveSizeInBits() < Source->getPrimitiveSizeInBits() &&
         (Dest->getPrimitiveSizeInBits() % 32 == 0);
}

bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
  const DataLayout *DL = getDataLayout();
  unsigned SrcSize = DL->getTypeSizeInBits(Src->getScalarType());
  unsigned DestSize = DL->getTypeSizeInBits(Dest->getScalarType());

  return SrcSize == 32 && DestSize == 64;
}

bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
  // Any register load of a 64-bit value really requires 2 32-bit moves. For all
  // practical purposes, the extra mov 0 to load a 64-bit is free.  As used,
  // this will enable reducing 64-bit operations the 32-bit, which is always
  // good.
  return Src == MVT::i32 && Dest == MVT::i64;
}

bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
  // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
  // limited number of native 64-bit operations. Shrinking an operation to fit
  // in a single 32-bit register should always be helpful. As currently used,
  // this is much less general than the name suggests, and is only used in
  // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
  // not profitable, and may actually be harmful.
  return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
}

//===---------------------------------------------------------------------===//
// TargetLowering Callbacks
//===---------------------------------------------------------------------===//

void AMDGPUTargetLowering::AnalyzeFormalArguments(CCState &State,
                             const SmallVectorImpl<ISD::InputArg> &Ins) const {

  State.AnalyzeFormalArguments(Ins, CC_AMDGPU);
}

SDValue AMDGPUTargetLowering::LowerReturn(
                                     SDValue Chain,
                                     CallingConv::ID CallConv,
                                     bool isVarArg,
                                     const SmallVectorImpl<ISD::OutputArg> &Outs,
                                     const SmallVectorImpl<SDValue> &OutVals,
                                     SDLoc DL, SelectionDAG &DAG) const {
  // Return values are not lowered here; emit a bare RET_FLAG on the chain.
  return DAG.getNode(AMDGPUISD::RET_FLAG, DL, MVT::Other, Chain);
}

//===---------------------------------------------------------------------===//
// Target specific lowering
//===---------------------------------------------------------------------===//

// Function calls are not supported; diagnose every call site with the best
// callee name we can recover from the callee SDNode.
SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
                                        SmallVectorImpl<SDValue> &InVals) const {
  SDValue Callee = CLI.Callee;
  SelectionDAG &DAG = CLI.DAG;

  const Function &Fn = *DAG.getMachineFunction().getFunction();

  StringRef FuncName("<unknown>");

  if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))
    FuncName = G->getSymbol();
  else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
    FuncName = G->getGlobal()->getName();

  // NOTE(review): the temporary Twine bound into NoCalls is destroyed at the
  // end of this declaration statement, but DiagnosticInfoUnsupported stores it
  // by reference and diagnose() reads it on the next line — consider
  // constructing the diagnostic inside the diagnose() call. TODO confirm.
  DiagnosticInfoUnsupported NoCalls(Fn, "call to function " + FuncName);
  DAG.getContext()->diagnose(NoCalls);
  return SDValue();
}
386SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) 387 const { 388 switch (Op.getOpcode()) { 389 default: 390 Op.getNode()->dump(); 391 llvm_unreachable("Custom lowering code for this" 392 "instruction is not implemented yet!"); 393 break; 394 // AMDIL DAG lowering 395 case ISD::SDIV: return LowerSDIV(Op, DAG); 396 case ISD::SREM: return LowerSREM(Op, DAG); 397 case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG); 398 case ISD::BRCOND: return LowerBRCOND(Op, DAG); 399 // AMDGPU DAG lowering 400 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); 401 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG); 402 case ISD::FrameIndex: return LowerFrameIndex(Op, DAG); 403 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); 404 case ISD::UDIVREM: return LowerUDIVREM(Op, DAG); 405 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); 406 } 407 return Op; 408} 409 410void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N, 411 SmallVectorImpl<SDValue> &Results, 412 SelectionDAG &DAG) const { 413 switch (N->getOpcode()) { 414 case ISD::SIGN_EXTEND_INREG: 415 // Different parts of legalization seem to interpret which type of 416 // sign_extend_inreg is the one to check for custom lowering. The extended 417 // from type is what really matters, but some places check for custom 418 // lowering of the result type. This results in trying to use 419 // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do 420 // nothing here and let the illegal result integer be handled normally. 
421 return; 422 case ISD::UDIV: { 423 SDValue Op = SDValue(N, 0); 424 SDLoc DL(Op); 425 EVT VT = Op.getValueType(); 426 SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), 427 N->getOperand(0), N->getOperand(1)); 428 Results.push_back(UDIVREM); 429 break; 430 } 431 case ISD::UREM: { 432 SDValue Op = SDValue(N, 0); 433 SDLoc DL(Op); 434 EVT VT = Op.getValueType(); 435 SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), 436 N->getOperand(0), N->getOperand(1)); 437 Results.push_back(UDIVREM.getValue(1)); 438 break; 439 } 440 case ISD::UDIVREM: { 441 SDValue Op = SDValue(N, 0); 442 SDLoc DL(Op); 443 EVT VT = Op.getValueType(); 444 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext()); 445 446 SDValue one = DAG.getConstant(1, HalfVT); 447 SDValue zero = DAG.getConstant(0, HalfVT); 448 449 //HiLo split 450 SDValue LHS = N->getOperand(0); 451 SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, zero); 452 SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, one); 453 454 SDValue RHS = N->getOperand(1); 455 SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, zero); 456 SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, one); 457 458 // Get Speculative values 459 SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo); 460 SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo); 461 462 SDValue REM_Hi = zero; 463 SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, zero, REM_Part, LHS_Hi, ISD::SETEQ); 464 465 SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, zero, DIV_Part, zero, ISD::SETEQ); 466 SDValue DIV_Lo = zero; 467 468 const unsigned halfBitWidth = HalfVT.getSizeInBits(); 469 470 for (unsigned i = 0; i < halfBitWidth; ++i) { 471 SDValue POS = DAG.getConstant(halfBitWidth - i - 1, HalfVT); 472 // Get Value of high bit 473 SDValue HBit; 474 if (halfBitWidth == 32 && Subtarget->hasBFE()) { 475 HBit = DAG.getNode(AMDGPUISD::BFE_U32, DL, HalfVT, LHS_Lo, 
POS, one); 476 } else { 477 HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS); 478 HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, one); 479 } 480 481 SDValue Carry = DAG.getNode(ISD::SRL, DL, HalfVT, REM_Lo, 482 DAG.getConstant(halfBitWidth - 1, HalfVT)); 483 REM_Hi = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Hi, one); 484 REM_Hi = DAG.getNode(ISD::OR, DL, HalfVT, REM_Hi, Carry); 485 486 REM_Lo = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Lo, one); 487 REM_Lo = DAG.getNode(ISD::OR, DL, HalfVT, REM_Lo, HBit); 488 489 490 SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi); 491 492 SDValue BIT = DAG.getConstant(1 << (halfBitWidth - i - 1), HalfVT); 493 SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, zero, ISD::SETGE); 494 495 DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT); 496 497 // Update REM 498 499 SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS); 500 501 REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETGE); 502 REM_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, zero); 503 REM_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, one); 504 } 505 506 SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi); 507 SDValue DIV = DAG.getNode(ISD::BUILD_PAIR, DL, VT, DIV_Lo, DIV_Hi); 508 Results.push_back(DIV); 509 Results.push_back(REM); 510 break; 511 } 512 default: 513 return; 514 } 515} 516 517// FIXME: This implements accesses to initialized globals in the constant 518// address space by copying them to private and accessing that. It does not 519// properly handle illegal types or vectors. The private vector loads are not 520// scalarized, and the illegal scalars hit an assertion. This technique will not 521// work well with large initializers, and this should eventually be 522// removed. Initialized globals should be placed into a data section that the 523// runtime will load into a buffer before the kernel is executed. 
// Uses of the global need to be replaced with a pointer loaded from an
// implicit kernel argument into this buffer holding the copy of the data,
// which will remove the need for any of this.

/// Emit stores that copy \p Init (recursively, for aggregates) to the memory
/// at \p InitPtr.  Returns the chain (a TokenFactor for aggregates).
SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init,
                                                       const GlobalValue *GV,
                                                       const SDValue &InitPtr,
                                                       SDValue Chain,
                                                       SelectionDAG &DAG) const {
  const DataLayout *TD = getTargetMachine().getDataLayout();
  SDLoc DL(InitPtr);

  // Scalar integer leaf: one plain store.
  if (const ConstantInt *CI = dyn_cast<ConstantInt>(Init)) {
    EVT VT = EVT::getEVT(CI->getType());
    PointerType *PtrTy = PointerType::get(CI->getType(), 0);
    return DAG.getStore(Chain, DL, DAG.getConstant(*CI, VT), InitPtr,
                 MachinePointerInfo(UndefValue::get(PtrTy)), false, false,
                 TD->getPrefTypeAlignment(CI->getType()));
  }

  // Scalar FP leaf: one plain store.
  if (const ConstantFP *CFP = dyn_cast<ConstantFP>(Init)) {
    EVT VT = EVT::getEVT(CFP->getType());
    PointerType *PtrTy = PointerType::get(CFP->getType(), 0);
    return DAG.getStore(Chain, DL, DAG.getConstantFP(*CFP, VT), InitPtr,
                 MachinePointerInfo(UndefValue::get(PtrTy)), false, false,
                 TD->getPrefTypeAlignment(CFP->getType()));
  }

  Type *InitTy = Init->getType();
  // Struct: recurse per field at its StructLayout offset.
  if (StructType *ST = dyn_cast<StructType>(InitTy)) {
    const StructLayout *SL = TD->getStructLayout(ST);

    EVT PtrVT = InitPtr.getValueType();
    SmallVector<SDValue, 8> Chains;

    for (unsigned I = 0, N = ST->getNumElements(); I != N; ++I) {
      SDValue Offset = DAG.getConstant(SL->getElementOffset(I), PtrVT);
      SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, InitPtr, Offset);

      Constant *Elt = Init->getAggregateElement(I);
      Chains.push_back(LowerConstantInitializer(Elt, GV, Ptr, Chain, DAG));
    }

    return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
  }

  // Array or vector: recurse per element at i * alloc-size.
  if (SequentialType *SeqTy = dyn_cast<SequentialType>(InitTy)) {
    EVT PtrVT = InitPtr.getValueType();

    unsigned NumElements;
    if (ArrayType *AT = dyn_cast<ArrayType>(SeqTy))
      NumElements = AT->getNumElements();
    else if (VectorType *VT = dyn_cast<VectorType>(SeqTy))
      NumElements = VT->getNumElements();
    else
      llvm_unreachable("Unexpected type");

    unsigned EltSize = TD->getTypeAllocSize(SeqTy->getElementType());
    SmallVector<SDValue, 8> Chains;
    for (unsigned i = 0; i < NumElements; ++i) {
      SDValue Offset = DAG.getConstant(i * EltSize, PtrVT);
      SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, InitPtr, Offset);

      Constant *Elt = Init->getAggregateElement(i);
      Chains.push_back(LowerConstantInitializer(Elt, GV, Ptr, Chain, DAG));
    }

    return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
  }

  Init->dump();
  llvm_unreachable("Unhandled constant initializer");
}

/// Lower a global address by address space: LOCAL globals get an LDS offset
/// assigned per function; CONSTANT globals are copied to a private stack
/// object (see FIXME above).
SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
                                                 SDValue Op,
                                                 SelectionDAG &DAG) const {

  const DataLayout *TD = getTargetMachine().getDataLayout();
  GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
  const GlobalValue *GV = G->getGlobal();

  switch (G->getAddressSpace()) {
  default: llvm_unreachable("Global Address lowering not implemented for this "
                            "address space");
  case AMDGPUAS::LOCAL_ADDRESS: {
    // XXX: What does the value of G->getOffset() mean?
    assert(G->getOffset() == 0 &&
           "Do not know what to do with an non-zero offset");

    unsigned Offset;
    // First use of this global in the function: carve out LDS space for it
    // and remember the offset; later uses reuse the cached offset.
    if (MFI->LocalMemoryObjects.count(GV) == 0) {
      uint64_t Size = TD->getTypeAllocSize(GV->getType()->getElementType());
      Offset = MFI->LDSSize;
      MFI->LocalMemoryObjects[GV] = Offset;
      // XXX: Account for alignment?
618 MFI->LDSSize += Size; 619 } else { 620 Offset = MFI->LocalMemoryObjects[GV]; 621 } 622 623 return DAG.getConstant(Offset, getPointerTy(G->getAddressSpace())); 624 } 625 case AMDGPUAS::CONSTANT_ADDRESS: { 626 MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo(); 627 Type *EltType = GV->getType()->getElementType(); 628 unsigned Size = TD->getTypeAllocSize(EltType); 629 unsigned Alignment = TD->getPrefTypeAlignment(EltType); 630 631 const GlobalVariable *Var = cast<GlobalVariable>(GV); 632 const Constant *Init = Var->getInitializer(); 633 int FI = FrameInfo->CreateStackObject(Size, Alignment, false); 634 SDValue InitPtr = DAG.getFrameIndex(FI, 635 getPointerTy(AMDGPUAS::PRIVATE_ADDRESS)); 636 SmallVector<SDNode*, 8> WorkList; 637 638 for (SDNode::use_iterator I = DAG.getEntryNode()->use_begin(), 639 E = DAG.getEntryNode()->use_end(); I != E; ++I) { 640 if (I->getOpcode() != AMDGPUISD::REGISTER_LOAD && I->getOpcode() != ISD::LOAD) 641 continue; 642 WorkList.push_back(*I); 643 } 644 SDValue Chain = LowerConstantInitializer(Init, GV, InitPtr, DAG.getEntryNode(), DAG); 645 for (SmallVector<SDNode*, 8>::iterator I = WorkList.begin(), 646 E = WorkList.end(); I != E; ++I) { 647 SmallVector<SDValue, 8> Ops; 648 Ops.push_back(Chain); 649 for (unsigned i = 1; i < (*I)->getNumOperands(); ++i) { 650 Ops.push_back((*I)->getOperand(i)); 651 } 652 DAG.UpdateNodeOperands(*I, Ops); 653 } 654 return DAG.getZExtOrTrunc(InitPtr, SDLoc(Op), 655 getPointerTy(AMDGPUAS::CONSTANT_ADDRESS)); 656 } 657 } 658} 659 660SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op, 661 SelectionDAG &DAG) const { 662 SmallVector<SDValue, 8> Args; 663 SDValue A = Op.getOperand(0); 664 SDValue B = Op.getOperand(1); 665 666 DAG.ExtractVectorElements(A, Args); 667 DAG.ExtractVectorElements(B, Args); 668 669 return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), Op.getValueType(), Args); 670} 671 672SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, 673 SelectionDAG &DAG) 
const { 674 675 SmallVector<SDValue, 8> Args; 676 unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 677 EVT VT = Op.getValueType(); 678 DAG.ExtractVectorElements(Op.getOperand(0), Args, Start, 679 VT.getVectorNumElements()); 680 681 return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), Op.getValueType(), Args); 682} 683 684SDValue AMDGPUTargetLowering::LowerFrameIndex(SDValue Op, 685 SelectionDAG &DAG) const { 686 687 MachineFunction &MF = DAG.getMachineFunction(); 688 const AMDGPUFrameLowering *TFL = 689 static_cast<const AMDGPUFrameLowering*>(getTargetMachine().getFrameLowering()); 690 691 FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Op); 692 assert(FIN); 693 694 unsigned FrameIndex = FIN->getIndex(); 695 unsigned Offset = TFL->getFrameIndexOffset(MF, FrameIndex); 696 return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), 697 Op.getValueType()); 698} 699 700SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, 701 SelectionDAG &DAG) const { 702 unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 703 SDLoc DL(Op); 704 EVT VT = Op.getValueType(); 705 706 switch (IntrinsicID) { 707 default: return Op; 708 case AMDGPUIntrinsic::AMDIL_abs: 709 return LowerIntrinsicIABS(Op, DAG); 710 case AMDGPUIntrinsic::AMDIL_exp: 711 return DAG.getNode(ISD::FEXP2, DL, VT, Op.getOperand(1)); 712 case AMDGPUIntrinsic::AMDGPU_lrp: 713 return LowerIntrinsicLRP(Op, DAG); 714 case AMDGPUIntrinsic::AMDIL_fraction: 715 return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1)); 716 case AMDGPUIntrinsic::AMDIL_max: 717 return DAG.getNode(AMDGPUISD::FMAX, DL, VT, Op.getOperand(1), 718 Op.getOperand(2)); 719 case AMDGPUIntrinsic::AMDGPU_imax: 720 return DAG.getNode(AMDGPUISD::SMAX, DL, VT, Op.getOperand(1), 721 Op.getOperand(2)); 722 case AMDGPUIntrinsic::AMDGPU_umax: 723 return DAG.getNode(AMDGPUISD::UMAX, DL, VT, Op.getOperand(1), 724 Op.getOperand(2)); 725 case AMDGPUIntrinsic::AMDIL_min: 726 return 
DAG.getNode(AMDGPUISD::FMIN, DL, VT, Op.getOperand(1), 727 Op.getOperand(2)); 728 case AMDGPUIntrinsic::AMDGPU_imin: 729 return DAG.getNode(AMDGPUISD::SMIN, DL, VT, Op.getOperand(1), 730 Op.getOperand(2)); 731 case AMDGPUIntrinsic::AMDGPU_umin: 732 return DAG.getNode(AMDGPUISD::UMIN, DL, VT, Op.getOperand(1), 733 Op.getOperand(2)); 734 735 case AMDGPUIntrinsic::AMDGPU_umul24: 736 return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, 737 Op.getOperand(1), Op.getOperand(2)); 738 739 case AMDGPUIntrinsic::AMDGPU_imul24: 740 return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, 741 Op.getOperand(1), Op.getOperand(2)); 742 743 case AMDGPUIntrinsic::AMDGPU_umad24: 744 return DAG.getNode(AMDGPUISD::MAD_U24, DL, VT, 745 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); 746 747 case AMDGPUIntrinsic::AMDGPU_imad24: 748 return DAG.getNode(AMDGPUISD::MAD_I24, DL, VT, 749 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); 750 751 case AMDGPUIntrinsic::AMDGPU_bfe_i32: 752 return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, 753 Op.getOperand(1), 754 Op.getOperand(2), 755 Op.getOperand(3)); 756 757 case AMDGPUIntrinsic::AMDGPU_bfe_u32: 758 return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT, 759 Op.getOperand(1), 760 Op.getOperand(2), 761 Op.getOperand(3)); 762 763 case AMDGPUIntrinsic::AMDGPU_bfi: 764 return DAG.getNode(AMDGPUISD::BFI, DL, VT, 765 Op.getOperand(1), 766 Op.getOperand(2), 767 Op.getOperand(3)); 768 769 case AMDGPUIntrinsic::AMDGPU_bfm: 770 return DAG.getNode(AMDGPUISD::BFM, DL, VT, 771 Op.getOperand(1), 772 Op.getOperand(2)); 773 774 case AMDGPUIntrinsic::AMDIL_round_nearest: 775 return DAG.getNode(ISD::FRINT, DL, VT, Op.getOperand(1)); 776 } 777} 778 779///IABS(a) = SMAX(sub(0, a), a) 780SDValue AMDGPUTargetLowering::LowerIntrinsicIABS(SDValue Op, 781 SelectionDAG &DAG) const { 782 SDLoc DL(Op); 783 EVT VT = Op.getValueType(); 784 SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT), 785 Op.getOperand(1)); 786 787 return DAG.getNode(AMDGPUISD::SMAX, DL, VT, Neg, 
Op.getOperand(1)); 788} 789 790/// Linear Interpolation 791/// LRP(a, b, c) = muladd(a, b, (1 - a) * c) 792SDValue AMDGPUTargetLowering::LowerIntrinsicLRP(SDValue Op, 793 SelectionDAG &DAG) const { 794 SDLoc DL(Op); 795 EVT VT = Op.getValueType(); 796 SDValue OneSubA = DAG.getNode(ISD::FSUB, DL, VT, 797 DAG.getConstantFP(1.0f, MVT::f32), 798 Op.getOperand(1)); 799 SDValue OneSubAC = DAG.getNode(ISD::FMUL, DL, VT, OneSubA, 800 Op.getOperand(3)); 801 return DAG.getNode(ISD::FADD, DL, VT, 802 DAG.getNode(ISD::FMUL, DL, VT, Op.getOperand(1), Op.getOperand(2)), 803 OneSubAC); 804} 805 806/// \brief Generate Min/Max node 807SDValue AMDGPUTargetLowering::CombineMinMax(SDNode *N, 808 SelectionDAG &DAG) const { 809 SDLoc DL(N); 810 EVT VT = N->getValueType(0); 811 812 SDValue LHS = N->getOperand(0); 813 SDValue RHS = N->getOperand(1); 814 SDValue True = N->getOperand(2); 815 SDValue False = N->getOperand(3); 816 SDValue CC = N->getOperand(4); 817 818 if (VT != MVT::f32 || 819 !((LHS == True && RHS == False) || (LHS == False && RHS == True))) { 820 return SDValue(); 821 } 822 823 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get(); 824 switch (CCOpcode) { 825 case ISD::SETOEQ: 826 case ISD::SETONE: 827 case ISD::SETUNE: 828 case ISD::SETNE: 829 case ISD::SETUEQ: 830 case ISD::SETEQ: 831 case ISD::SETFALSE: 832 case ISD::SETFALSE2: 833 case ISD::SETTRUE: 834 case ISD::SETTRUE2: 835 case ISD::SETUO: 836 case ISD::SETO: 837 llvm_unreachable("Operation should already be optimised!"); 838 case ISD::SETULE: 839 case ISD::SETULT: 840 case ISD::SETOLE: 841 case ISD::SETOLT: 842 case ISD::SETLE: 843 case ISD::SETLT: { 844 unsigned Opc = (LHS == True) ? AMDGPUISD::FMIN : AMDGPUISD::FMAX; 845 return DAG.getNode(Opc, DL, VT, LHS, RHS); 846 } 847 case ISD::SETGT: 848 case ISD::SETGE: 849 case ISD::SETUGE: 850 case ISD::SETOGE: 851 case ISD::SETUGT: 852 case ISD::SETOGT: { 853 unsigned Opc = (LHS == True) ? 
AMDGPUISD::FMAX : AMDGPUISD::FMIN; 854 return DAG.getNode(Opc, DL, VT, LHS, RHS); 855 } 856 case ISD::SETCC_INVALID: 857 llvm_unreachable("Invalid setcc condcode!"); 858 } 859 return SDValue(); 860} 861 862SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue &Op, 863 SelectionDAG &DAG) const { 864 LoadSDNode *Load = dyn_cast<LoadSDNode>(Op); 865 EVT MemEltVT = Load->getMemoryVT().getVectorElementType(); 866 EVT EltVT = Op.getValueType().getVectorElementType(); 867 EVT PtrVT = Load->getBasePtr().getValueType(); 868 unsigned NumElts = Load->getMemoryVT().getVectorNumElements(); 869 SmallVector<SDValue, 8> Loads; 870 SDLoc SL(Op); 871 872 for (unsigned i = 0, e = NumElts; i != e; ++i) { 873 SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, Load->getBasePtr(), 874 DAG.getConstant(i * (MemEltVT.getSizeInBits() / 8), PtrVT)); 875 Loads.push_back(DAG.getExtLoad(Load->getExtensionType(), SL, EltVT, 876 Load->getChain(), Ptr, 877 MachinePointerInfo(Load->getMemOperand()->getValue()), 878 MemEltVT, Load->isVolatile(), Load->isNonTemporal(), 879 Load->getAlignment())); 880 } 881 return DAG.getNode(ISD::BUILD_VECTOR, SL, Op.getValueType(), Loads); 882} 883 884SDValue AMDGPUTargetLowering::MergeVectorStore(const SDValue &Op, 885 SelectionDAG &DAG) const { 886 StoreSDNode *Store = dyn_cast<StoreSDNode>(Op); 887 EVT MemVT = Store->getMemoryVT(); 888 unsigned MemBits = MemVT.getSizeInBits(); 889 890 // Byte stores are really expensive, so if possible, try to pack 32-bit vector 891 // truncating store into an i32 store. 892 // XXX: We could also handle optimize other vector bitwidths. 
  // Only worth doing for vector stores that fit in a single 32-bit word.
  if (!MemVT.isVector() || MemBits > 32) {
    return SDValue();
  }

  SDLoc DL(Op);
  SDValue Value = Store->getValue();
  EVT VT = Value.getValueType();
  EVT ElemVT = VT.getVectorElementType();
  SDValue Ptr = Store->getBasePtr();
  EVT MemEltVT = MemVT.getVectorElementType();
  unsigned MemEltBits = MemEltVT.getSizeInBits();
  unsigned MemNumElements = MemVT.getVectorNumElements();
  unsigned PackedSize = MemVT.getStoreSizeInBits();
  // Per-element mask; MemEltBits < 32 here (MemBits <= 32 and it's a vector),
  // so the shift cannot overflow.
  SDValue Mask = DAG.getConstant((1 << MemEltBits) - 1, MVT::i32);

  assert(Value.getValueType().getScalarSizeInBits() >= 32);

  // OR the truncated elements together into one packed i32 word, element i
  // occupying bits [i*MemEltBits, (i+1)*MemEltBits).
  SDValue PackedValue;
  for (unsigned i = 0; i < MemNumElements; ++i) {
    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT, Value,
                              DAG.getConstant(i, MVT::i32));
    Elt = DAG.getZExtOrTrunc(Elt, DL, MVT::i32);
    Elt = DAG.getNode(ISD::AND, DL, MVT::i32, Elt, Mask); // getZeroExtendInReg

    SDValue Shift = DAG.getConstant(MemEltBits * i, MVT::i32);
    Elt = DAG.getNode(ISD::SHL, DL, MVT::i32, Elt, Shift);

    if (i == 0) {
      PackedValue = Elt;
    } else {
      PackedValue = DAG.getNode(ISD::OR, DL, MVT::i32, PackedValue, Elt);
    }
  }

  // If the packed word is narrower than 32 bits, emit a truncating store of
  // the appropriately sized integer type.
  if (PackedSize < 32) {
    EVT PackedVT = EVT::getIntegerVT(*DAG.getContext(), PackedSize);
    return DAG.getTruncStore(Store->getChain(), DL, PackedValue, Ptr,
                             Store->getMemOperand()->getPointerInfo(),
                             PackedVT,
                             Store->isNonTemporal(), Store->isVolatile(),
                             Store->getAlignment());
  }

  return DAG.getStore(Store->getChain(), DL, PackedValue, Ptr,
                      Store->getMemOperand()->getPointerInfo(),
                      Store->isVolatile(), Store->isNonTemporal(),
                      Store->getAlignment());
}

/// \brief Split a vector store into one truncating scalar store per element;
/// the individual store chains are joined with a TokenFactor.
SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
                                               SelectionDAG &DAG) const {
  StoreSDNode *Store = cast<StoreSDNode>(Op);
  EVT MemEltVT = Store->getMemoryVT().getVectorElementType();
  EVT EltVT = Store->getValue().getValueType().getVectorElementType();
  EVT PtrVT = Store->getBasePtr().getValueType();
  unsigned NumElts = Store->getMemoryVT().getVectorNumElements();
  SDLoc SL(Op);

  SmallVector<SDValue, 8> Chains;

  for (unsigned i = 0, e = NumElts; i != e; ++i) {
    SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
                              Store->getValue(), DAG.getConstant(i, MVT::i32));
    // Address of element i: base + i * sizeof(element in memory).
    SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT,
                              Store->getBasePtr(),
                            DAG.getConstant(i * (MemEltVT.getSizeInBits() / 8),
                                            PtrVT));
    Chains.push_back(DAG.getTruncStore(Store->getChain(), SL, Val, Ptr,
                         MachinePointerInfo(Store->getMemOperand()->getValue()),
                         MemEltVT, Store->isVolatile(), Store->isNonTemporal(),
                         Store->getAlignment()));
  }
  return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, Chains);
}

/// \brief Custom lowering for LOAD nodes (continues below); handles wide
/// extloads, i1 loads, and loads that are turned into REGISTER_LOAD.
SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  LoadSDNode *Load = cast<LoadSDNode>(Op);
  ISD::LoadExtType ExtType = Load->getExtensionType();
  EVT VT = Op.getValueType();
  EVT MemVT = Load->getMemoryVT();

  if (ExtType != ISD::NON_EXTLOAD && !VT.isVector() && VT.getSizeInBits() > 32) {
    // We can do the extload to 32-bits, and then need to separately extend to
    // 64-bits.

    SDValue ExtLoad32 = DAG.getExtLoad(ExtType, DL, MVT::i32,
                                       Load->getChain(),
                                       Load->getBasePtr(),
                                       MemVT,
                                       Load->getMemOperand());
    // Re-extend with the extension kind matching the original extload.
    return DAG.getNode(ISD::getExtForLoadExtType(ExtType), DL, VT, ExtLoad32);
  }

  if (ExtType == ISD::NON_EXTLOAD && VT.getSizeInBits() < 32) {
    assert(VT == MVT::i1 && "Only i1 non-extloads expected");
    // FIXME: Copied from PPC
    // First, load into 32 bits, then truncate to 1 bit.
    SDValue Chain = Load->getChain();
    SDValue BasePtr = Load->getBasePtr();
    MachineMemOperand *MMO = Load->getMemOperand();

    // Load a full byte zero-extended to i32, then truncate down to the i1
    // result type.
    SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
                                   BasePtr, MVT::i8, MMO);
    return DAG.getNode(ISD::TRUNCATE, DL, VT, NewLD);
  }

  // Lower loads of constant-address-space global variables.
  if (Load->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
      isa<GlobalVariable>(
          GetUnderlyingObject(Load->getMemOperand()->getValue()))) {

    SDValue Ptr = DAG.getZExtOrTrunc(Load->getBasePtr(), DL,
                                     getPointerTy(AMDGPUAS::PRIVATE_ADDRESS));
    // Convert the byte address into a dword (32-bit word) index.
    Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
                      DAG.getConstant(2, MVT::i32));
    return DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(),
                       Load->getChain(), Ptr,
                       DAG.getTargetConstant(0, MVT::i32), Op.getOperand(2));
  }

  // Everything below handles only sub-dword extloads from private address
  // space; anything else keeps the default lowering.
  if (Load->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS ||
      ExtType == ISD::NON_EXTLOAD || Load->getMemoryVT().bitsGE(MVT::i32))
    return SDValue();


  // Load the containing dword, then shift the addressed byte(s) down into the
  // low bits and extend in-register.
  SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Load->getBasePtr(),
                            DAG.getConstant(2, MVT::i32));
  SDValue Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(),
                            Load->getChain(), Ptr,
                            DAG.getTargetConstant(0, MVT::i32),
                            Op.getOperand(2));
  // Byte offset within the dword (0-3)...
  SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32,
                                Load->getBasePtr(),
                                DAG.getConstant(0x3, MVT::i32));
  // ...converted to a bit shift amount (byte index * 8).
  SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
                                 DAG.getConstant(3, MVT::i32));

  Ret = DAG.getNode(ISD::SRL, DL, MVT::i32, Ret, ShiftAmt);

  EVT MemEltVT = MemVT.getScalarType();
  if (ExtType == ISD::SEXTLOAD) {
    SDValue MemEltVTNode = DAG.getValueType(MemEltVT);
    return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode);
  }

  return DAG.getZeroExtendInReg(Ret, DL, MemEltVT);
}

/// \brief Custom lowering for STORE nodes: merge or split vector stores, and
/// expand sub-dword private-address stores into a read-modify-write of the
/// containing dword.
SDValue AMDGPUTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue Result = AMDGPUTargetLowering::MergeVectorStore(Op, DAG);
  if (Result.getNode()) {
    return Result;
  }

  StoreSDNode *Store = cast<StoreSDNode>(Op);
  SDValue Chain = Store->getChain();
  if ((Store->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
       Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) &&
      Store->getValue().getValueType().isVector()) {
    return SplitVectorStore(Op, DAG);
  }

  EVT MemVT = Store->getMemoryVT();
  if (Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS &&
      MemVT.bitsLT(MVT::i32)) {
    // NOTE(review): Mask stays 0 for any sub-dword type other than i8/i16 --
    // presumably only those occur here; confirm against callers.
    unsigned Mask = 0;
    if (Store->getMemoryVT() == MVT::i8) {
      Mask = 0xff;
    } else if (Store->getMemoryVT() == MVT::i16) {
      Mask = 0xffff;
    }
    SDValue BasePtr = Store->getBasePtr();
    // Dword index of the containing 32-bit word.
    SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, BasePtr,
                              DAG.getConstant(2, MVT::i32));
    // Read the current contents of that dword.
    SDValue Dst = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
                              Chain, Ptr, DAG.getTargetConstant(0, MVT::i32));

    // Bit position of the stored value within the dword: byte offset * 8.
    SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, BasePtr,
                                  DAG.getConstant(0x3, MVT::i32));

    SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
                                   DAG.getConstant(3, MVT::i32));

    SDValue SExtValue = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32,
                                    Store->getValue());

    // Keep only MemVT's bits of the value, then move them into position.
    SDValue MaskedValue = DAG.getZeroExtendInReg(SExtValue, DL, MemVT);

    SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32,
                                       MaskedValue, ShiftAmt);

    // Clear the destination bits (~(Mask << ShiftAmt)) before OR-ing in the
    // new value.
    SDValue DstMask = DAG.getNode(ISD::SHL, DL, MVT::i32, DAG.getConstant(Mask, MVT::i32),
                                  ShiftAmt);
    DstMask = DAG.getNode(ISD::XOR, DL, MVT::i32, DstMask,
                          DAG.getConstant(0xffffffff, MVT::i32));
    Dst = DAG.getNode(ISD::AND, DL, MVT::i32, Dst, DstMask);

    SDValue Value = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue);
    return DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
                       Chain, Value, Ptr, DAG.getTargetConstant(0, MVT::i32));
  }
  return SDValue();
}

/// \brief Expand unsigned 32-bit div/rem using the URECIP reciprocal
/// approximation, followed by correction of the +/-1 rounding error.
SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  SDValue Num = Op.getOperand(0);
  SDValue Den = Op.getOperand(1);

  // RCP = URECIP(Den) = 2^32 / Den + e
  // e is rounding error.
  SDValue RCP = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Den);

  // RCP_LO = umulo(RCP, Den)
  // NOTE(review): UMULO normally carries a second overflow result; only the
  // low multiply result is used here -- verify how the target expands it.
  SDValue RCP_LO = DAG.getNode(ISD::UMULO, DL, VT, RCP, Den);

  // RCP_HI = mulhu(RCP, Den)
  SDValue RCP_HI = DAG.getNode(ISD::MULHU, DL, VT, RCP, Den);

  // NEG_RCP_LO = -RCP_LO
  SDValue NEG_RCP_LO = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT),
                                   RCP_LO);

  // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
  SDValue ABS_RCP_LO = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, VT),
                                       NEG_RCP_LO, RCP_LO,
                                       ISD::SETEQ);
  // Calculate the rounding error from the URECIP instruction
  // E = mulhu(ABS_RCP_LO, RCP)
  SDValue E = DAG.getNode(ISD::MULHU, DL, VT, ABS_RCP_LO, RCP);

  // RCP_A_E = RCP + E
  SDValue RCP_A_E = DAG.getNode(ISD::ADD, DL, VT, RCP, E);

  // RCP_S_E = RCP - E
  SDValue RCP_S_E = DAG.getNode(ISD::SUB, DL, VT, RCP, E);

  // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_SUB_E)
  SDValue Tmp0 = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, VT),
                                 RCP_A_E, RCP_S_E,
                                 ISD::SETEQ);
  // Quotient = mulhu(Tmp0, Num)
  SDValue Quotient = DAG.getNode(ISD::MULHU, DL, VT, Tmp0, Num);

  // Num_S_Remainder = Quotient * Den
  SDValue Num_S_Remainder = DAG.getNode(ISD::UMULO, DL, VT, Quotient, Den);

  // Remainder = Num - Num_S_Remainder
  SDValue Remainder = DAG.getNode(ISD::SUB, DL, VT, Num, Num_S_Remainder);

  // Remainder_GE_Den = (Remainder >= Den ? -1 : 0)
  SDValue Remainder_GE_Den = DAG.getSelectCC(DL, Remainder, Den,
                                             DAG.getConstant(-1, VT),
                                             DAG.getConstant(0, VT),
                                             ISD::SETUGE);
  // Remainder_GE_Zero = (Num >= Num_S_Remainder ? -1 : 0)
  SDValue Remainder_GE_Zero = DAG.getSelectCC(DL, Num,
                                              Num_S_Remainder,
                                              DAG.getConstant(-1, VT),
                                              DAG.getConstant(0, VT),
                                              ISD::SETUGE);
  // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
  SDValue Tmp1 = DAG.getNode(ISD::AND, DL, VT, Remainder_GE_Den,
                             Remainder_GE_Zero);

  // Calculate Division result:

  // Quotient_A_One = Quotient + 1
  SDValue Quotient_A_One = DAG.getNode(ISD::ADD, DL, VT, Quotient,
                                       DAG.getConstant(1, VT));

  // Quotient_S_One = Quotient - 1
  SDValue Quotient_S_One = DAG.getNode(ISD::SUB, DL, VT, Quotient,
                                       DAG.getConstant(1, VT));

  // Div = (Tmp1 == 0 ? Quotient : Quotient_A_One)
  SDValue Div = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, VT),
                                Quotient, Quotient_A_One, ISD::SETEQ);

  // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div)
  Div = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, VT),
                        Quotient_S_One, Div, ISD::SETEQ);

  // Calculate Rem result:

  // Remainder_S_Den = Remainder - Den
  SDValue Remainder_S_Den = DAG.getNode(ISD::SUB, DL, VT, Remainder, Den);

  // Remainder_A_Den = Remainder + Den
  SDValue Remainder_A_Den = DAG.getNode(ISD::ADD, DL, VT, Remainder, Den);

  // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den)
  SDValue Rem = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, VT),
                                Remainder, Remainder_S_Den, ISD::SETEQ);

  // Rem = (Remainder_GE_Zero == 0 ? Remainder_A_Den : Rem)
  Rem = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, VT),
                        Remainder_A_Den, Rem, ISD::SETEQ);
  SDValue Ops[2] = {
    Div,
    Rem
  };
  return DAG.getMergeValues(Ops, DL);
}

/// \brief Lower i64 -> f32 uint_to_fp by converting the two 32-bit halves
/// separately and combining them as FloatLo + FloatHi * 2^32.
SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
                                              SelectionDAG &DAG) const {
  SDValue S0 = Op.getOperand(0);
  SDLoc DL(Op);
  if (Op.getValueType() != MVT::f32 || S0.getValueType() != MVT::i64)
    return SDValue();

  // f32 uint_to_fp i64
  SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, S0,
                           DAG.getConstant(0, MVT::i32));
  SDValue FloatLo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, Lo);
  SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, S0,
                           DAG.getConstant(1, MVT::i32));
  SDValue FloatHi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, Hi);
  FloatHi = DAG.getNode(ISD::FMUL, DL, MVT::f32, FloatHi,
                        DAG.getConstantFP(4294967296.0f, MVT::f32)); // 2^32
  return DAG.getNode(ISD::FADD, DL, MVT::f32, FloatLo, FloatHi);

}

/// \brief Expand sign_extend_inreg as shl + sra by the width difference.
SDValue AMDGPUTargetLowering::ExpandSIGN_EXTEND_INREG(SDValue Op,
                                                      unsigned BitsDiff,
                                                      SelectionDAG &DAG) const {
  MVT VT = Op.getSimpleValueType();
  SDLoc DL(Op);
  SDValue Shift = DAG.getConstant(BitsDiff, VT);
  // Shift left by 'Shift' bits.
  SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, Op.getOperand(0), Shift);
  // Signed shift right by 'Shift' bits.
  return DAG.getNode(ISD::SRA, DL, VT, Shl, Shift);
}

/// \brief Scalarize a vector sign_extend_inreg; scalar cases are left to the
/// default lowering.
SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
                                                     SelectionDAG &DAG) const {
  EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
  MVT VT = Op.getSimpleValueType();
  MVT ScalarVT = VT.getScalarType();

  if (!VT.isVector())
    return SDValue();

  SDValue Src = Op.getOperand(0);
  SDLoc DL(Op);

  // TODO: Don't scalarize on Evergreen?
  // Apply the sign_extend_inreg element-wise, then rebuild the vector.
  unsigned NElts = VT.getVectorNumElements();
  SmallVector<SDValue, 8> Args;
  DAG.ExtractVectorElements(Src, Args, 0, NElts);

  SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
  for (unsigned I = 0; I < NElts; ++I)
    Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);

  return DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Args);
}

//===----------------------------------------------------------------------===//
// Custom DAG optimizations
//===----------------------------------------------------------------------===//

/// Return true if the value is provably representable as an unsigned 24-bit
/// integer (its high bits are known zero).
static bool isU24(SDValue Op, SelectionDAG &DAG) {
  APInt KnownZero, KnownOne;
  EVT VT = Op.getValueType();
  DAG.computeKnownBits(Op, KnownZero, KnownOne);

  return (VT.getSizeInBits() - KnownZero.countLeadingOnes()) <= 24;
}

/// Return true if the value is provably representable as a signed 24-bit
/// integer (it has at least bitwidth - 23 sign bits).
static bool isI24(SDValue Op, SelectionDAG &DAG) {
  EVT VT = Op.getValueType();

  // In order for this to be a signed 24-bit value, bit 23, must
  // be a sign bit.
  return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
                                     // as unsigned 24-bit values.
         (VT.getSizeInBits() - DAG.ComputeNumSignBits(Op)) < 24;
}

/// Since only the low 24 bits of Op are consumed, try to simplify the
/// operands based on that demanded mask and commit any simplification.
static void simplifyI24(SDValue Op, TargetLowering::DAGCombinerInfo &DCI) {

  SelectionDAG &DAG = DCI.DAG;
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  EVT VT = Op.getValueType();

  APInt Demanded = APInt::getLowBitsSet(VT.getSizeInBits(), 24);
  APInt KnownZero, KnownOne;
  TargetLowering::TargetLoweringOpt TLO(DAG, true, true);
  if (TLI.SimplifyDemandedBits(Op, Demanded, KnownZero, KnownOne, TLO))
    DCI.CommitTargetLoweringOpt(TLO);
}

/// Constant-fold a bitfield extract of Src0 at [Offset, Offset+Width).
/// IntTy selects signed (int32_t, arithmetic shift) vs unsigned (uint32_t)
/// extraction semantics.
template <typename IntTy>
static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0,
                               uint32_t Offset, uint32_t Width) {
  if (Width + Offset < 32) {
    // Shift the field up to the top, then back down so the shift type's
    // signedness performs the sign/zero extension.
    IntTy Result = (Src0 << (32 - Offset - Width)) >> (32 - Width);
    return DAG.getConstant(Result, MVT::i32);
  }

  // Field reaches the top bit: a plain shift extracts it.
  return DAG.getConstant(Src0 >> Offset, MVT::i32);
}

/// Target DAG combines: 24-bit multiplies, min/max from SELECT_CC, and BFE
/// simplifications (continues below).
SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  switch(N->getOpcode()) {
  default: break;
  case ISD::MUL: {
    EVT VT = N->getValueType(0);
    SDValue N0 = N->getOperand(0);
    SDValue N1 = N->getOperand(1);
    SDValue Mul;

    // FIXME: Add support for 24-bit multiply with 64-bit output on SI.
    if (VT.isVector() || VT.getSizeInBits() > 32)
      break;

    // If both operands fit in 24 bits, use the hardware 24-bit multiply.
    if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
      N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
      N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
      Mul = DAG.getNode(AMDGPUISD::MUL_U24, DL, MVT::i32, N0, N1);
    } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
      N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
      N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
      Mul = DAG.getNode(AMDGPUISD::MUL_I24, DL, MVT::i32, N0, N1);
    } else {
      break;
    }

    // We need to use sext even for MUL_U24, because MUL_U24 is used
    // for signed multiply of 8 and 16-bit types.
    SDValue Reg = DAG.getSExtOrTrunc(Mul, DL, VT);

    return Reg;
  }
  case AMDGPUISD::MUL_I24:
  case AMDGPUISD::MUL_U24: {
    // Only the low 24 bits of each operand matter; try to simplify them.
    SDValue N0 = N->getOperand(0);
    SDValue N1 = N->getOperand(1);
    simplifyI24(N0, DCI);
    simplifyI24(N1, DCI);
    return SDValue();
  }
  case ISD::SELECT_CC: {
    return CombineMinMax(N, DAG);
  }
  case AMDGPUISD::BFE_I32:
  case AMDGPUISD::BFE_U32: {
    assert(!N->getValueType(0).isVector() &&
           "Vector handling of BFE not implemented");
    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
    if (!Width)
      break;

    // Hardware only honors the low 5 bits of the width operand.
    uint32_t WidthVal = Width->getZExtValue() & 0x1f;
    if (WidthVal == 0)
      return DAG.getConstant(0, MVT::i32);

    ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
    if (!Offset)
      break;

    SDValue BitsFrom = N->getOperand(0);
    uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;

    bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;

    if (OffsetVal == 0) {
      // This is already sign / zero extended, so try to fold away extra BFEs.
      unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal);

      unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom);
      if (OpSignBits >= SignBits)
        return BitsFrom;

      EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal);
      if (Signed) {
        // This is a sign_extend_inreg. Replace it to take advantage of existing
        // DAG Combines. If not eliminated, we will match back to BFE during
        // selection.

        // TODO: The sext_inreg of extended types ends, although we could
        // handle them in a single BFE.
        return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom,
                           DAG.getValueType(SmallVT));
      }

      return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT);
    }

    // Constant input: fold the whole extract.
    if (ConstantSDNode *Val = dyn_cast<ConstantSDNode>(N->getOperand(0))) {
      if (Signed) {
        return constantFoldBFE<int32_t>(DAG,
                                        Val->getSExtValue(),
                                        OffsetVal,
                                        WidthVal);
      }

      return constantFoldBFE<uint32_t>(DAG,
                                       Val->getZExtValue(),
                                       OffsetVal,
                                       WidthVal);
    }

    APInt Demanded = APInt::getBitsSet(32,
                                       OffsetVal,
                                       OffsetVal + WidthVal);

    // If the field runs to the top bit, a plain (arithmetic) shift suffices.
    if ((OffsetVal + WidthVal) >= 32) {
      SDValue ShiftVal = DAG.getConstant(OffsetVal, MVT::i32);
      return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
                         BitsFrom, ShiftVal);
    }

    // Otherwise, only bits [OffsetVal, OffsetVal+WidthVal) of the input are
    // demanded; try to simplify it on that basis.
    APInt KnownZero, KnownOne;
    TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
                                          !DCI.isBeforeLegalizeOps());
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    if (TLO.ShrinkDemandedConstant(BitsFrom, Demanded) ||
        TLI.SimplifyDemandedBits(BitsFrom, Demanded, KnownZero, KnownOne, TLO)) {
      DCI.CommitTargetLoweringOpt(TLO);
    }

    break;
  }
  }
  return SDValue();
}

//===----------------------------------------------------------------------===//
// Helper functions
//===----------------------------------------------------------------------===//

/// Reconstruct, for each lowered input argument, an InputArg describing the
/// type it had before legalization split/promoted it (continues below).
void AMDGPUTargetLowering::getOriginalFunctionArgs(
                                 SelectionDAG &DAG,
                                 const Function *F,
                                 const SmallVectorImpl<ISD::InputArg> &Ins,
                                 SmallVectorImpl<ISD::InputArg> &OrigIns) const {

  for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
    if (Ins[i].ArgVT == Ins[i].VT) {
      // Argument was not legalized; keep it as-is.
      OrigIns.push_back(Ins[i]);
      continue;
    }

    EVT VT;
    if (Ins[i].ArgVT.isVector() && !Ins[i].VT.isVector()) {
      // Vector has been split into scalars.
      VT = Ins[i].ArgVT.getVectorElementType();
    } else if (Ins[i].VT.isVector() && Ins[i].ArgVT.isVector() &&
               Ins[i].ArgVT.getVectorElementType() !=
               Ins[i].VT.getVectorElementType()) {
      // Vector elements have been promoted
      VT = Ins[i].ArgVT;
    } else {
      // Vector has been split into smaller vectors.
      VT = Ins[i].VT;
    }

    ISD::InputArg Arg(Ins[i].Flags, VT, VT, Ins[i].Used,
                      Ins[i].OrigArgIndex, Ins[i].PartOffset);
    OrigIns.push_back(Arg);
  }
}

/// Return true if Op is the constant the hardware treats as "true":
/// 1.0 for floating point, all-ones for integers.
bool AMDGPUTargetLowering::isHWTrueValue(SDValue Op) const {
  if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) {
    return CFP->isExactlyValue(1.0);
  }
  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
    return C->isAllOnesValue();
  }
  return false;
}

/// Return true if Op is the constant the hardware treats as "false":
/// zero for both floating point and integers.
bool AMDGPUTargetLowering::isHWFalseValue(SDValue Op) const {
  if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) {
    return CFP->getValueAPF().isZero();
  }
  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
    return C->isNullValue();
  }
  return false;
}

/// Return a register node for physical register Reg, creating (and
/// registering) a live-in virtual register for it on first use.
SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
                                                  const TargetRegisterClass *RC,
                                                   unsigned Reg, EVT VT) const {
  MachineFunction &MF = DAG.getMachineFunction();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  unsigned VirtualRegister;
  if (!MRI.isLiveIn(Reg)) {
    // First use: allocate a virtual register and mark Reg live-in.
    VirtualRegister = MRI.createVirtualRegister(RC);
    MRI.addLiveIn(Reg, VirtualRegister);
  } else {
    // Reuse the virtual register already associated with this live-in.
    VirtualRegister = MRI.getLiveInVirtReg(Reg);
  }
  return DAG.getRegister(VirtualRegister, VT);
}

#define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node;

const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
  switch (Opcode) {
  default: return nullptr;
  // AMDIL DAG nodes
  NODE_NAME_CASE(CALL);
  NODE_NAME_CASE(UMUL);
  NODE_NAME_CASE(DIV_INF);
  NODE_NAME_CASE(RET_FLAG);
  NODE_NAME_CASE(BRANCH_COND);

  // AMDGPU DAG nodes
  NODE_NAME_CASE(DWORDADDR)
  NODE_NAME_CASE(FRACT)
  NODE_NAME_CASE(FMAX)
  NODE_NAME_CASE(SMAX)
  NODE_NAME_CASE(UMAX)
  NODE_NAME_CASE(FMIN)
  NODE_NAME_CASE(SMIN)
  NODE_NAME_CASE(UMIN)
  NODE_NAME_CASE(BFE_U32)
  NODE_NAME_CASE(BFE_I32)
  NODE_NAME_CASE(BFI)
  NODE_NAME_CASE(BFM)
  NODE_NAME_CASE(MUL_U24)
  NODE_NAME_CASE(MUL_I24)
  NODE_NAME_CASE(MAD_U24)
  NODE_NAME_CASE(MAD_I24)
  NODE_NAME_CASE(URECIP)
  NODE_NAME_CASE(DOT4)
  NODE_NAME_CASE(EXPORT)
  NODE_NAME_CASE(CONST_ADDRESS)
  NODE_NAME_CASE(REGISTER_LOAD)
  NODE_NAME_CASE(REGISTER_STORE)
  NODE_NAME_CASE(LOAD_CONSTANT)
  NODE_NAME_CASE(LOAD_INPUT)
  NODE_NAME_CASE(SAMPLE)
  NODE_NAME_CASE(SAMPLEB)
  NODE_NAME_CASE(SAMPLED)
  NODE_NAME_CASE(SAMPLEL)
  NODE_NAME_CASE(STORE_MSKOR)
  NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
  }
}

/// For a min/max of Op0 and Op1, a bit is known only if it is known the same
/// way in both operands, so intersect the two known-bit sets.
static void computeKnownBitsForMinMax(const SDValue Op0,
                                      const SDValue Op1,
                                      APInt &KnownZero,
                                      APInt &KnownOne,
                                      const SelectionDAG &DAG,
                                      unsigned Depth) {
  APInt Op0Zero, Op0One;
  APInt Op1Zero, Op1One;
  DAG.computeKnownBits(Op0, Op0Zero, Op0One, Depth);
  DAG.computeKnownBits(Op1, Op1Zero, Op1One, Depth);

  KnownZero = Op0Zero & Op1Zero;
  KnownOne = Op0One & Op1One;
}

/// Known-bits computation for AMDGPU-specific nodes (min/max and BFE);
/// continues below.
void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
  const SDValue Op,
  APInt &KnownZero,
  APInt &KnownOne,
  const SelectionDAG &DAG,
  unsigned Depth) const {

  KnownZero = KnownOne = APInt(KnownOne.getBitWidth(), 0); // Don't know anything.

  APInt KnownZero2;
  APInt KnownOne2;
  unsigned Opc = Op.getOpcode();

  switch (Opc) {
  default:
    break;
  case ISD::INTRINSIC_WO_CHAIN: {
    // FIXME: The intrinsic should just use the node.
1585 switch (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue()) { 1586 case AMDGPUIntrinsic::AMDGPU_imax: 1587 case AMDGPUIntrinsic::AMDGPU_umax: 1588 case AMDGPUIntrinsic::AMDGPU_imin: 1589 case AMDGPUIntrinsic::AMDGPU_umin: 1590 computeKnownBitsForMinMax(Op.getOperand(1), Op.getOperand(2), 1591 KnownZero, KnownOne, DAG, Depth); 1592 break; 1593 default: 1594 break; 1595 } 1596 1597 break; 1598 } 1599 case AMDGPUISD::SMAX: 1600 case AMDGPUISD::UMAX: 1601 case AMDGPUISD::SMIN: 1602 case AMDGPUISD::UMIN: 1603 computeKnownBitsForMinMax(Op.getOperand(0), Op.getOperand(1), 1604 KnownZero, KnownOne, DAG, Depth); 1605 break; 1606 1607 case AMDGPUISD::BFE_I32: 1608 case AMDGPUISD::BFE_U32: { 1609 ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2)); 1610 if (!CWidth) 1611 return; 1612 1613 unsigned BitWidth = 32; 1614 uint32_t Width = CWidth->getZExtValue() & 0x1f; 1615 if (Width == 0) { 1616 KnownZero = APInt::getAllOnesValue(BitWidth); 1617 KnownOne = APInt::getNullValue(BitWidth); 1618 return; 1619 } 1620 1621 // FIXME: This could do a lot more. If offset is 0, should be the same as 1622 // sign_extend_inreg implementation, but that involves duplicating it. 
1623 if (Opc == AMDGPUISD::BFE_I32) 1624 KnownOne = APInt::getHighBitsSet(BitWidth, BitWidth - Width); 1625 else 1626 KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - Width); 1627 1628 break; 1629 } 1630 } 1631} 1632 1633unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode( 1634 SDValue Op, 1635 const SelectionDAG &DAG, 1636 unsigned Depth) const { 1637 switch (Op.getOpcode()) { 1638 case AMDGPUISD::BFE_I32: { 1639 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2)); 1640 if (!Width) 1641 return 1; 1642 1643 unsigned SignBits = 32 - Width->getZExtValue() + 1; 1644 ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(Op.getOperand(1)); 1645 if (!Offset || !Offset->isNullValue()) 1646 return SignBits; 1647 1648 // TODO: Could probably figure something out with non-0 offsets. 1649 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1); 1650 return std::max(SignBits, Op0SignBits); 1651 } 1652 1653 case AMDGPUISD::BFE_U32: { 1654 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2)); 1655 return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1; 1656 } 1657 1658 default: 1659 return 1; 1660 } 1661} 1662