//===- BasicTTIImpl.h -------------------------------------------*- C++ -*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
/// \file
/// This file provides a helper that implements much of the TTI interface in
/// terms of the target-independent code generator and TargetLowering
/// interfaces.
///
//===----------------------------------------------------------------------===//

#ifndef LLVM_CODEGEN_BASICTTIIMPL_H
#define LLVM_CODEGEN_BASICTTIIMPL_H

#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfoImpl.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetSubtargetInfo.h"

namespace llvm {

extern cl::opt<unsigned> PartialUnrollingThreshold;

/// \brief Base class which can be used to help build a TTI implementation.
///
/// This class provides as much implementation of the TTI interface as is
/// possible using the target-independent parts of the code generator.
///
/// In order to subclass it, your class must implement a getST() method to
/// return the subtarget, and a getTLI() method to return the target lowering.
/// We need these methods implemented in the derived class so that this class
/// doesn't have to duplicate storage for them.
template <typename T>
class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
private:
  typedef TargetTransformInfoImplCRTPBase<T> BaseT;
  typedef TargetTransformInfo TTI;

  /// Estimate a cost of shuffle as a sequence of extract and insert
  /// operations.
  unsigned getPermuteShuffleOverhead(Type *Ty) {
    assert(Ty->isVectorTy() && "Can only shuffle vectors");
    unsigned Cost = 0;
    // Shuffle cost is equal to the cost of extracting each element from its
    // argument plus the cost of inserting it into the result vector.

    // E.g. a <4 x float> shuffle with a mask of <0,5,2,7> extracts from
    // index 0 of the first vector, index 1 of the second vector, index 2 of
    // the first vector, and finally index 3 of the second vector, and inserts
    // the elements at indices <0,1,2,3> of the result vector.
    for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) {
      Cost += static_cast<T *>(this)
                  ->getVectorInstrCost(Instruction::InsertElement, Ty, i);
      Cost += static_cast<T *>(this)
                  ->getVectorInstrCost(Instruction::ExtractElement, Ty, i);
    }
    return Cost;
  }

  /// \brief Local query method delegates up to T which *must* implement this!
  const TargetSubtargetInfo *getST() const {
    return static_cast<const T *>(this)->getST();
  }

  /// \brief Local query method delegates up to T which *must* implement this!
  const TargetLoweringBase *getTLI() const {
    return static_cast<const T *>(this)->getTLI();
  }
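  // Illustrative only: a minimal derived class satisfying the CRTP contract
  // described above might look like the following (names here are
  // hypothetical; the concrete BasicTTIImpl at the end of this file is the
  // in-tree example):
  //
  //   class MyTTIImpl : public BasicTTIImplBase<MyTTIImpl> {
  //     friend class BasicTTIImplBase<MyTTIImpl>;
  //     const MySubtarget *ST;        // owned by the target machine
  //     const MyTargetLowering *TLI;
  //     const MySubtarget *getST() const { return ST; }
  //     const MyTargetLowering *getTLI() const { return TLI; }
  //   };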
protected:
  explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL)
      : BaseT(DL) {}

  using TargetTransformInfoImplBase::DL;

public:
  /// \name Scalar TTI Implementations
  /// @{
  bool allowsMisalignedMemoryAccesses(LLVMContext &Context,
                                      unsigned BitWidth, unsigned AddressSpace,
                                      unsigned Alignment, bool *Fast) const {
    EVT E = EVT::getIntegerVT(Context, BitWidth);
    return getTLI()->allowsMisalignedMemoryAccesses(E, AddressSpace,
                                                    Alignment, Fast);
  }

  bool hasBranchDivergence() { return false; }

  bool isSourceOfDivergence(const Value *V) { return false; }

  bool isAlwaysUniform(const Value *V) { return false; }

  unsigned getFlatAddressSpace() {
    // Return an invalid address space.
    return -1;
  }

  bool isLegalAddImmediate(int64_t imm) {
    return getTLI()->isLegalAddImmediate(imm);
  }

  bool isLegalICmpImmediate(int64_t imm) {
    return getTLI()->isLegalICmpImmediate(imm);
  }

  bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
                             bool HasBaseReg, int64_t Scale,
                             unsigned AddrSpace) {
    TargetLoweringBase::AddrMode AM;
    AM.BaseGV = BaseGV;
    AM.BaseOffs = BaseOffset;
    AM.HasBaseReg = HasBaseReg;
    AM.Scale = Scale;
    return getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace);
  }

  bool isLSRCostLess(TTI::LSRCost C1, TTI::LSRCost C2) {
    return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
  }

  int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
                           bool HasBaseReg, int64_t Scale, unsigned AddrSpace) {
    TargetLoweringBase::AddrMode AM;
    AM.BaseGV = BaseGV;
    AM.BaseOffs = BaseOffset;
    AM.HasBaseReg = HasBaseReg;
    AM.Scale = Scale;
    return getTLI()->getScalingFactorCost(DL, AM, Ty, AddrSpace);
  }

  bool isFoldableMemAccessOffset(Instruction *I, int64_t Offset) {
    return getTLI()->isFoldableMemAccessOffset(I, Offset);
  }

  bool isTruncateFree(Type *Ty1, Type *Ty2) {
    return getTLI()->isTruncateFree(Ty1, Ty2);
  }

  bool isProfitableToHoist(Instruction *I) {
    return getTLI()->isProfitableToHoist(I);
  }

  bool isTypeLegal(Type *Ty) {
    EVT VT = getTLI()->getValueType(DL, Ty);
    return getTLI()->isTypeLegal(VT);
  }

  int getGEPCost(Type *PointeeType, const Value *Ptr,
                 ArrayRef<const Value *> Operands) {
    return BaseT::getGEPCost(PointeeType, Ptr, Operands);
  }

  unsigned getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
                            ArrayRef<const Value *> Arguments) {
    return BaseT::getIntrinsicCost(IID, RetTy, Arguments);
  }

  unsigned getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
                            ArrayRef<Type *> ParamTys) {
    if (IID == Intrinsic::cttz) {
      if (getTLI()->isCheapToSpeculateCttz())
        return TargetTransformInfo::TCC_Basic;
      return TargetTransformInfo::TCC_Expensive;
    }

    if (IID == Intrinsic::ctlz) {
      if (getTLI()->isCheapToSpeculateCtlz())
        return TargetTransformInfo::TCC_Basic;
      return TargetTransformInfo::TCC_Expensive;
    }

    return BaseT::getIntrinsicCost(IID, RetTy, ParamTys);
  }
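  // For example (illustrative): a subtarget with a native count-trailing-zeros
  // instruction reports isCheapToSpeculateCttz(), so
  //   %r = call i32 @llvm.cttz.i32(i32 %x, i1 false)
  // is costed at TCC_Basic; without it, the expansion is assumed to be
  // TCC_Expensive.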
  unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
                                            unsigned &JumpTableSize) {
    /// Try to find the estimated number of clusters. Note that the number of
    /// clusters identified in this function could be different from the actual
    /// numbers found in lowering. This function ignores switches that are
    /// lowered with a mix of jump table / bit test / BTree. This function was
    /// initially intended to be used when estimating the cost of a switch in
    /// the inline cost heuristic, but it's a generic cost model to be used in
    /// other places (e.g., in loop unrolling).
    unsigned N = SI.getNumCases();
    const TargetLoweringBase *TLI = getTLI();
    const DataLayout &DL = this->getDataLayout();

    JumpTableSize = 0;
    bool IsJTAllowed = TLI->areJTsAllowed(SI.getParent()->getParent());

    // Early exit if both a jump table and bit test are not allowed.
    if (N < 1 || (!IsJTAllowed && DL.getPointerSizeInBits() < N))
      return N;

    APInt MaxCaseVal = SI.case_begin()->getCaseValue()->getValue();
    APInt MinCaseVal = MaxCaseVal;
    for (auto CI : SI.cases()) {
      const APInt &CaseVal = CI.getCaseValue()->getValue();
      if (CaseVal.sgt(MaxCaseVal))
        MaxCaseVal = CaseVal;
      if (CaseVal.slt(MinCaseVal))
        MinCaseVal = CaseVal;
    }

    // Check if suitable for a bit test.
    if (N <= DL.getPointerSizeInBits()) {
      SmallPtrSet<const BasicBlock *, 4> Dests;
      for (auto I : SI.cases())
        Dests.insert(I.getCaseSuccessor());

      if (TLI->isSuitableForBitTests(Dests.size(), N, MinCaseVal, MaxCaseVal,
                                     DL))
        return 1;
    }

    // Check if suitable for a jump table.
    if (IsJTAllowed) {
      if (N < 2 || N < TLI->getMinimumJumpTableEntries())
        return N;
      uint64_t Range =
          (MaxCaseVal - MinCaseVal).getLimitedValue(UINT64_MAX - 1) + 1;
      // Check whether a range of clusters is dense enough for a jump table.
      if (TLI->isSuitableForJumpTable(&SI, N, Range)) {
        JumpTableSize = Range;
        return 1;
      }
    }
    return N;
  }

  unsigned getJumpBufAlignment() { return getTLI()->getJumpBufAlignment(); }

  unsigned getJumpBufSize() { return getTLI()->getJumpBufSize(); }

  bool shouldBuildLookupTables() {
    const TargetLoweringBase *TLI = getTLI();
    return TLI->isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) ||
           TLI->isOperationLegalOrCustom(ISD::BRIND, MVT::Other);
  }

  bool haveFastSqrt(Type *Ty) {
    const TargetLoweringBase *TLI = getTLI();
    EVT VT = TLI->getValueType(DL, Ty);
    return TLI->isTypeLegal(VT) &&
           TLI->isOperationLegalOrCustom(ISD::FSQRT, VT);
  }

  unsigned getFPOpCost(Type *Ty) {
    // By default, FP instructions cost no more than integer instructions,
    // since they are implemented in HW. Target-specific TTI can override this.
    return TargetTransformInfo::TCC_Basic;
  }
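  // For example (illustrative): a target without hardware floating point
  // would typically override getFPOpCost() to return something like
  // TargetTransformInfo::TCC_Expensive, since each FP operation becomes a
  // libcall after lowering.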
  unsigned getOperationCost(unsigned Opcode, Type *Ty, Type *OpTy) {
    const TargetLoweringBase *TLI = getTLI();
    switch (Opcode) {
    default: break;
    case Instruction::Trunc: {
      if (TLI->isTruncateFree(OpTy, Ty))
        return TargetTransformInfo::TCC_Free;
      return TargetTransformInfo::TCC_Basic;
    }
    case Instruction::ZExt: {
      if (TLI->isZExtFree(OpTy, Ty))
        return TargetTransformInfo::TCC_Free;
      return TargetTransformInfo::TCC_Basic;
    }
    }

    return BaseT::getOperationCost(Opcode, Ty, OpTy);
  }

  unsigned getInliningThresholdMultiplier() { return 1; }

  void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP) {
    // This unrolling functionality is target independent, but to provide some
    // motivation for its intended use, for x86:

    // According to the Intel 64 and IA-32 Architectures Optimization Reference
    // Manual, Intel Core models and later have a loop stream detector (and
    // associated uop queue) that can benefit from partial unrolling.
    // The relevant requirements are:
    //  - The loop must have no more than 4 (8 for Nehalem and later) branches
    //    taken, and none of them may be calls.
    //  - The loop can have no more than 18 (28 for Nehalem and later) uops.

    // According to the Software Optimization Guide for AMD Family 15h
    // Processors, models 30h-4fh (Steamroller and later) have a loop predictor
    // and loop buffer which can benefit from partial unrolling.
    // The relevant requirements are:
    //  - The loop must have fewer than 16 branches
    //  - The loop must have fewer than 40 uops in all executed loop branches

    // The number of taken branches in a loop is hard to estimate here, and
    // benchmarking has revealed that it is better not to be conservative when
    // estimating the branch count. As a result, we'll ignore the branch limits
    // until someone finds a case where it matters in practice.

    unsigned MaxOps;
    const TargetSubtargetInfo *ST = getST();
    if (PartialUnrollingThreshold.getNumOccurrences() > 0)
      MaxOps = PartialUnrollingThreshold;
    else if (ST->getSchedModel().LoopMicroOpBufferSize > 0)
      MaxOps = ST->getSchedModel().LoopMicroOpBufferSize;
    else
      return;

    // Scan the loop: don't unroll loops with calls.
    for (Loop::block_iterator I = L->block_begin(), E = L->block_end(); I != E;
         ++I) {
      BasicBlock *BB = *I;

      for (BasicBlock::iterator J = BB->begin(), JE = BB->end(); J != JE; ++J)
        if (isa<CallInst>(J) || isa<InvokeInst>(J)) {
          ImmutableCallSite CS(&*J);
          if (const Function *F = CS.getCalledFunction()) {
            if (!static_cast<T *>(this)->isLoweredToCall(F))
              continue;
          }

          return;
        }
    }

    // Enable runtime and partial unrolling up to the specified size.
    // Enable using the trip count upper bound to unroll loops.
    UP.Partial = UP.Runtime = UP.UpperBound = true;
    UP.PartialThreshold = MaxOps;

    // Avoid unrolling when optimizing for size.
    UP.OptSizeThreshold = 0;
    UP.PartialOptSizeThreshold = 0;

    // Set number of instructions optimized when "back edge"
    // becomes "fall through" to default value of 2.
    UP.BEInsns = 2;
  }
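  // Worked example (illustrative numbers): if the subtarget's scheduling
  // model reports LoopMicroOpBufferSize = 28 and PartialUnrollingThreshold
  // was not set on the command line, then MaxOps = 28, so a call-free loop
  // is eligible for partial/runtime unrolling as long as the unrolled body
  // stays within roughly 28 instructions.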
  /// @}

  /// \name Vector TTI Implementations
  /// @{

  unsigned getNumberOfRegisters(bool Vector) { return Vector ? 0 : 1; }

  unsigned getRegisterBitWidth(bool Vector) const { return 32; }

  /// Estimate the overhead of scalarizing an instruction. Insert and Extract
  /// are set if the result needs to be inserted and/or extracted from vectors.
  unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) {
    assert(Ty->isVectorTy() && "Can only scalarize vectors");
    unsigned Cost = 0;

    for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) {
      if (Insert)
        Cost += static_cast<T *>(this)
                    ->getVectorInstrCost(Instruction::InsertElement, Ty, i);
      if (Extract)
        Cost += static_cast<T *>(this)
                    ->getVectorInstrCost(Instruction::ExtractElement, Ty, i);
    }

    return Cost;
  }

  /// Estimate the overhead of scalarizing an instruction's unique
  /// non-constant operands. The types of the arguments are ordinarily
  /// scalar, in which case the costs are multiplied by VF.
  unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
                                            unsigned VF) {
    unsigned Cost = 0;
    SmallPtrSet<const Value *, 4> UniqueOperands;
    for (const Value *A : Args) {
      if (!isa<Constant>(A) && UniqueOperands.insert(A).second) {
        Type *VecTy = nullptr;
        if (A->getType()->isVectorTy()) {
          VecTy = A->getType();
          // If A is a vector operand, VF should be 1 or correspond to A.
          assert((VF == 1 || VF == VecTy->getVectorNumElements()) &&
                 "Vector argument does not match VF");
        } else
          VecTy = VectorType::get(A->getType(), VF);

        Cost += getScalarizationOverhead(VecTy, false, true);
      }
    }

    return Cost;
  }

  unsigned getScalarizationOverhead(Type *VecTy, ArrayRef<const Value *> Args) {
    assert(VecTy->isVectorTy());

    unsigned Cost = 0;

    Cost += getScalarizationOverhead(VecTy, true, false);
    if (!Args.empty())
      Cost += getOperandsScalarizationOverhead(Args,
                                               VecTy->getVectorNumElements());
    else
      // When no information on arguments is provided, we add the cost
      // associated with one argument as a heuristic.
      Cost += getScalarizationOverhead(VecTy, false, true);

    return Cost;
  }
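  // Worked example (illustrative, default per-element cost of 1): scalarizing
  // a <4 x i32> operation whose result is re-inserted into a vector costs
  // four InsertElement queries for the result, plus four ExtractElement
  // queries per unique non-constant operand -- e.g. 4 + 4 = 8 for a single
  // operand.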
  unsigned getMaxInterleaveFactor(unsigned VF) { return 1; }

  unsigned getArithmeticInstrCost(
      unsigned Opcode, Type *Ty,
      TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
      TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
      TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
      TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
      ArrayRef<const Value *> Args = ArrayRef<const Value *>()) {
    // Check if any of the operands are vector operands.
    const TargetLoweringBase *TLI = getTLI();
    int ISD = TLI->InstructionOpcodeToISD(Opcode);
    assert(ISD && "Invalid opcode");

    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);

    bool IsFloat = Ty->getScalarType()->isFloatingPointTy();
    // Assume that floating point arithmetic operations cost twice as much as
    // integer operations.
    unsigned OpCost = (IsFloat ? 2 : 1);

    if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
      // The operation is legal. Assume it costs 1.
      // TODO: Once we have extract/insert subvector cost we need to use them.
      return LT.first * OpCost;
    }

    if (!TLI->isOperationExpand(ISD, LT.second)) {
      // If the operation is custom lowered, then assume that the code is twice
      // as expensive.
      return LT.first * 2 * OpCost;
    }

    // Else, assume that we need to scalarize this op.
    // TODO: If one of the types gets legalized by splitting, handle this
    // similarly to what getCastInstrCost() does.
    if (Ty->isVectorTy()) {
      unsigned Num = Ty->getVectorNumElements();
      unsigned Cost = static_cast<T *>(this)
                          ->getArithmeticInstrCost(Opcode, Ty->getScalarType());
      // Return the cost of multiple scalar invocations plus the cost of
      // inserting and extracting the values.
      return getScalarizationOverhead(Ty, Args) + Num * Cost;
    }

    // We don't know anything about this scalar instruction.
    return OpCost;
  }

  unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
                          Type *SubTp) {
    if (Kind == TTI::SK_Alternate || Kind == TTI::SK_PermuteTwoSrc ||
        Kind == TTI::SK_PermuteSingleSrc) {
      return getPermuteShuffleOverhead(Tp);
    }
    return 1;
  }
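  // For example, with the default per-element cost of 1, a two-source permute
  // of <4 x float> is modeled as 4 extracts + 4 inserts = 8, while any other
  // shuffle kind (broadcast, reverse, ...) is assumed to be a single target
  // instruction with cost 1.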
  unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
                            const Instruction *I = nullptr) {
    const TargetLoweringBase *TLI = getTLI();
    int ISD = TLI->InstructionOpcodeToISD(Opcode);
    assert(ISD && "Invalid opcode");
    std::pair<unsigned, MVT> SrcLT = TLI->getTypeLegalizationCost(DL, Src);
    std::pair<unsigned, MVT> DstLT = TLI->getTypeLegalizationCost(DL, Dst);

    // Check for NOOP conversions.
    if (SrcLT.first == DstLT.first &&
        SrcLT.second.getSizeInBits() == DstLT.second.getSizeInBits()) {

      // Bitcasts between types that are legalized to the same type are free.
      if (Opcode == Instruction::BitCast || Opcode == Instruction::Trunc)
        return 0;
    }

    if (Opcode == Instruction::Trunc &&
        TLI->isTruncateFree(SrcLT.second, DstLT.second))
      return 0;

    if (Opcode == Instruction::ZExt &&
        TLI->isZExtFree(SrcLT.second, DstLT.second))
      return 0;

    if (Opcode == Instruction::AddrSpaceCast &&
        TLI->isNoopAddrSpaceCast(Src->getPointerAddressSpace(),
                                 Dst->getPointerAddressSpace()))
      return 0;

    // If this is a zext/sext of a load, return 0 if the corresponding
    // extending load exists on the target.
    if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt) &&
        I && isa<LoadInst>(I->getOperand(0))) {
      EVT ExtVT = EVT::getEVT(Dst);
      EVT LoadVT = EVT::getEVT(Src);
      unsigned LType =
          ((Opcode == Instruction::ZExt) ? ISD::ZEXTLOAD : ISD::SEXTLOAD);
      if (TLI->isLoadExtLegal(LType, ExtVT, LoadVT))
        return 0;
    }

    // If the cast is marked as legal (or promote) then assume low cost.
    if (SrcLT.first == DstLT.first &&
        TLI->isOperationLegalOrPromote(ISD, DstLT.second))
      return 1;

    // Handle scalar conversions.
    if (!Src->isVectorTy() && !Dst->isVectorTy()) {

      // Scalar bitcasts are usually free.
      if (Opcode == Instruction::BitCast)
        return 0;

      // Just check the op cost. If the operation is legal then assume it
      // costs 1.
      if (!TLI->isOperationExpand(ISD, DstLT.second))
        return 1;

      // Assume that illegal scalar instructions are expensive.
      return 4;
    }

    // Check vector-to-vector casts.
    if (Dst->isVectorTy() && Src->isVectorTy()) {

      // If the cast is between same-sized registers, then the check is simple.
      if (SrcLT.first == DstLT.first &&
          SrcLT.second.getSizeInBits() == DstLT.second.getSizeInBits()) {

        // Assume that Zext is done using AND.
        if (Opcode == Instruction::ZExt)
          return 1;

        // Assume that sext is done using SHL and SRA.
        if (Opcode == Instruction::SExt)
          return 2;

        // Just check the op cost. If the operation is legal then assume it
        // costs 1 and multiply by the type-legalization overhead.
        if (!TLI->isOperationExpand(ISD, DstLT.second))
          return SrcLT.first * 1;
      }

      // If we are legalizing by splitting, query the concrete TTI for the cost
      // of casting the original vector twice. We also need to factor in the
      // cost of the split itself. Count that as 1, to be consistent with
      // TLI->getTypeLegalizationCost().
      if ((TLI->getTypeAction(Src->getContext(), TLI->getValueType(DL, Src)) ==
           TargetLowering::TypeSplitVector) ||
          (TLI->getTypeAction(Dst->getContext(), TLI->getValueType(DL, Dst)) ==
           TargetLowering::TypeSplitVector)) {
        Type *SplitDst = VectorType::get(Dst->getVectorElementType(),
                                         Dst->getVectorNumElements() / 2);
        Type *SplitSrc = VectorType::get(Src->getVectorElementType(),
                                         Src->getVectorNumElements() / 2);
        T *TTI = static_cast<T *>(this);
        return TTI->getVectorSplitCost() +
               (2 * TTI->getCastInstrCost(Opcode, SplitDst, SplitSrc, I));
      }

      // In other cases where the source or destination are illegal, assume
      // the operation will get scalarized.
      unsigned Num = Dst->getVectorNumElements();
      unsigned Cost = static_cast<T *>(this)->getCastInstrCost(
          Opcode, Dst->getScalarType(), Src->getScalarType(), I);

      // Return the cost of multiple scalar invocations plus the cost of
      // inserting and extracting the values.
      return getScalarizationOverhead(Dst, true, true) + Num * Cost;
    }

    // We already handled vector-to-vector and scalar-to-scalar conversions.
    // This is where we handle bitcast between vectors and scalars. We need to
    // assume that the conversion is scalarized in one way or another.
    if (Opcode == Instruction::BitCast)
      // Illegal bitcasts are done by storing and loading from a stack slot.
      return (Src->isVectorTy() ? getScalarizationOverhead(Src, false, true)
                                : 0) +
             (Dst->isVectorTy() ? getScalarizationOverhead(Dst, true, false)
                                : 0);

    llvm_unreachable("Unhandled cast");
  }

  unsigned getExtractWithExtendCost(unsigned Opcode, Type *Dst,
                                    VectorType *VecTy, unsigned Index) {
    return static_cast<T *>(this)->getVectorInstrCost(
               Instruction::ExtractElement, VecTy, Index) +
           static_cast<T *>(this)->getCastInstrCost(Opcode, Dst,
                                                    VecTy->getElementType());
  }

  unsigned getCFInstrCost(unsigned Opcode) {
    // Branches are assumed to be predicted.
    return 0;
  }
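  // For example, in getCastInstrCost above, a pattern such as
  //   %w = zext i32 %loaded to i64    ; where %loaded comes from a load
  // is costed at 0 when the target reports the corresponding ZEXTLOAD as
  // legal, because the extension is folded into the load.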
  unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
                              const Instruction *I) {
    const TargetLoweringBase *TLI = getTLI();
    int ISD = TLI->InstructionOpcodeToISD(Opcode);
    assert(ISD && "Invalid opcode");

    // Selects on vectors are actually vector selects.
    if (ISD == ISD::SELECT) {
      assert(CondTy && "CondTy must exist");
      if (CondTy->isVectorTy())
        ISD = ISD::VSELECT;
    }
    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);

    if (!(ValTy->isVectorTy() && !LT.second.isVector()) &&
        !TLI->isOperationExpand(ISD, LT.second)) {
      // The operation is legal. Assume it costs 1. Multiply
      // by the type-legalization overhead.
      return LT.first * 1;
    }

    // Otherwise, assume that the cast is scalarized.
    // TODO: If one of the types gets legalized by splitting, handle this
    // similarly to what getCastInstrCost() does.
    if (ValTy->isVectorTy()) {
      unsigned Num = ValTy->getVectorNumElements();
      if (CondTy)
        CondTy = CondTy->getScalarType();
      unsigned Cost = static_cast<T *>(this)->getCmpSelInstrCost(
          Opcode, ValTy->getScalarType(), CondTy, I);

      // Return the cost of multiple scalar invocations plus the cost of
      // inserting and extracting the values.
      return getScalarizationOverhead(ValTy, true, false) + Num * Cost;
    }

    // Unknown scalar opcode.
    return 1;
  }

  unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
    std::pair<unsigned, MVT> LT =
        getTLI()->getTypeLegalizationCost(DL, Val->getScalarType());

    return LT.first;
  }

  unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
                           unsigned AddressSpace,
                           const Instruction *I = nullptr) {
    assert(!Src->isVoidTy() && "Invalid type");
    std::pair<unsigned, MVT> LT = getTLI()->getTypeLegalizationCost(DL, Src);

    // Assume that all loads of legal types cost 1.
    unsigned Cost = LT.first;

    if (Src->isVectorTy() &&
        Src->getPrimitiveSizeInBits() < LT.second.getSizeInBits()) {
      // This is a vector load that legalizes to a larger type than the vector
      // itself. Unless the corresponding extending load or truncating store is
      // legal, this will scalarize.
      TargetLowering::LegalizeAction LA = TargetLowering::Expand;
      EVT MemVT = getTLI()->getValueType(DL, Src);
      if (Opcode == Instruction::Store)
        LA = getTLI()->getTruncStoreAction(LT.second, MemVT);
      else
        LA = getTLI()->getLoadExtAction(ISD::EXTLOAD, LT.second, MemVT);

      if (LA != TargetLowering::Legal && LA != TargetLowering::Custom) {
        // This is a vector load/store for some illegal type that is scalarized.
        // We must account for the cost of building or decomposing the vector.
        Cost += getScalarizationOverhead(Src, Opcode != Instruction::Store,
                                         Opcode == Instruction::Store);
      }
    }

    return Cost;
  }
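  // For example (illustrative): a load of <3 x i32> on a target whose nearest
  // legal type is v4i32 legalizes to a wider load; if the corresponding
  // EXTLOAD is neither Legal nor Custom, the model adds the cost of inserting
  // the three loaded elements into a vector on top of the base load cost.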
  unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                      unsigned Factor,
                                      ArrayRef<unsigned> Indices,
                                      unsigned Alignment,
                                      unsigned AddressSpace) {
    VectorType *VT = dyn_cast<VectorType>(VecTy);
    assert(VT && "Expect a vector type for interleaved memory op");

    unsigned NumElts = VT->getNumElements();
    assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");

    unsigned NumSubElts = NumElts / Factor;
    VectorType *SubVT = VectorType::get(VT->getElementType(), NumSubElts);

    // First, the cost of the load/store operation.
    unsigned Cost = static_cast<T *>(this)->getMemoryOpCost(
        Opcode, VecTy, Alignment, AddressSpace);

    // Legalize the vector type, and get the legalized and unlegalized type
    // sizes.
    MVT VecTyLT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
    unsigned VecTySize =
        static_cast<T *>(this)->getDataLayout().getTypeStoreSize(VecTy);
    unsigned VecTyLTSize = VecTyLT.getStoreSize();

    // Return the ceiling of dividing A by B.
    auto ceil = [](unsigned A, unsigned B) { return (A + B - 1) / B; };

    // Scale the cost of the memory operation by the fraction of legalized
    // instructions that will actually be used. We shouldn't account for the
    // cost of dead instructions since they will be removed.
    //
    // E.g., an interleaved load of factor 8:
    //       %vec = load <16 x i64>, <16 x i64>* %ptr
    //       %v0 = shufflevector %vec, undef, <0, 8>
    //
    // If <16 x i64> is legalized to 8 v2i64 loads, only 2 of the loads will be
    // used (those corresponding to elements [0:1] and [8:9] of the unlegalized
    // type). The other loads are unused.
    //
    // We only scale the cost of loads since interleaved store groups aren't
    // allowed to have gaps.
    if (Opcode == Instruction::Load && VecTySize > VecTyLTSize) {

      // The number of loads of a legal type it will take to represent a load
      // of the unlegalized vector type.
      unsigned NumLegalInsts = ceil(VecTySize, VecTyLTSize);

      // The number of elements of the unlegalized type that correspond to a
      // single legal instruction.
      unsigned NumEltsPerLegalInst = ceil(NumElts, NumLegalInsts);

      // Determine which legal instructions will be used.
      BitVector UsedInsts(NumLegalInsts, false);
      for (unsigned Index : Indices)
        for (unsigned Elt = 0; Elt < NumSubElts; ++Elt)
          UsedInsts.set((Index + Elt * Factor) / NumEltsPerLegalInst);

      // Scale the cost of the load by the fraction of legal instructions that
      // will be used. Multiply before dividing so integer truncation doesn't
      // zero out the cost when only a few of the legal loads are used.
      Cost = Cost * UsedInsts.count() / NumLegalInsts;
    }

    // Then add the cost of the interleave operation.
    if (Opcode == Instruction::Load) {
      // The interleave cost is similar to extracting the sub vectors'
      // elements from the wide vector, and inserting them into the sub
      // vectors.
      //
      // E.g. An interleaved load of factor 2 (with one member of index 0):
      //      %vec = load <8 x i32>, <8 x i32>* %ptr
      //      %v0 = shuffle %vec, undef, <0, 2, 4, 6>   ; Index 0
      // The cost is estimated as extracting the elements at 0, 2, 4, 6 from
      // the <8 x i32> vector and inserting them into a <4 x i32> vector.

      assert(Indices.size() <= Factor &&
             "Interleaved memory op has too many members");

      for (unsigned Index : Indices) {
        assert(Index < Factor && "Invalid index for interleaved memory op");

        // Extract elements from the loaded vector for each sub vector.
        for (unsigned i = 0; i < NumSubElts; i++)
          Cost += static_cast<T *>(this)->getVectorInstrCost(
              Instruction::ExtractElement, VT, Index + i * Factor);
      }

      unsigned InsSubCost = 0;
      for (unsigned i = 0; i < NumSubElts; i++)
        InsSubCost += static_cast<T *>(this)->getVectorInstrCost(
            Instruction::InsertElement, SubVT, i);

      Cost += Indices.size() * InsSubCost;
    } else {
      // The interleave cost is to extract all elements from the sub vectors,
      // and insert them into the wide vector.
      //
      // E.g. An interleaved store of factor 2:
      //      %v0_v1 = shuffle %v0, %v1, <0, 4, 1, 5, 2, 6, 3, 7>
      //      store <8 x i32> %interleaved.vec, <8 x i32>* %ptr
      // The cost is estimated as extracting all elements from both <4 x i32>
      // vectors and inserting them into the <8 x i32> vector.

      unsigned ExtSubCost = 0;
      for (unsigned i = 0; i < NumSubElts; i++)
        ExtSubCost += static_cast<T *>(this)->getVectorInstrCost(
            Instruction::ExtractElement, SubVT, i);
      Cost += ExtSubCost * Factor;

      for (unsigned i = 0; i < NumElts; i++)
        Cost += static_cast<T *>(this)
                    ->getVectorInstrCost(Instruction::InsertElement, VT, i);
    }

    return Cost;
  }
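  // Putting the pieces above together (illustrative, default unit costs): an
  // interleaved load of factor 2 from <8 x i32> with both members requested
  // costs the base load, plus 8 extracts from the wide vector, plus
  // 2 * 4 = 8 inserts into the <4 x i32> sub vectors.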
  /// Get intrinsic cost based on arguments.
  unsigned getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
                                 ArrayRef<Value *> Args, FastMathFlags FMF,
                                 unsigned VF = 1) {
    unsigned RetVF = (RetTy->isVectorTy() ? RetTy->getVectorNumElements() : 1);
    assert((RetVF == 1 || VF == 1) && "VF > 1 and RetVF is a vector type");

    switch (IID) {
    default: {
      // Assume that we need to scalarize this intrinsic.
      SmallVector<Type *, 4> Types;
      for (Value *Op : Args) {
        Type *OpTy = Op->getType();
        assert(VF == 1 || !OpTy->isVectorTy());
        Types.push_back(VF == 1 ? OpTy : VectorType::get(OpTy, VF));
      }

      if (VF > 1 && !RetTy->isVoidTy())
        RetTy = VectorType::get(RetTy, VF);

      // Compute the scalarization overhead based on Args for a vector
      // intrinsic. A vectorizer will pass a scalar RetTy and VF > 1, while
      // CostModel will pass a vector RetTy and VF of 1.
      unsigned ScalarizationCost = UINT_MAX;
      if (RetVF > 1 || VF > 1) {
        ScalarizationCost = 0;
        if (!RetTy->isVoidTy())
          ScalarizationCost += getScalarizationOverhead(RetTy, true, false);
        ScalarizationCost += getOperandsScalarizationOverhead(Args, VF);
      }

      return static_cast<T *>(this)->getIntrinsicInstrCost(
          IID, RetTy, Types, FMF, ScalarizationCost);
    }
    case Intrinsic::masked_scatter: {
      assert(VF == 1 && "Can't vectorize types here.");
      Value *Mask = Args[3];
      bool VarMask = !isa<Constant>(Mask);
      unsigned Alignment = cast<ConstantInt>(Args[2])->getZExtValue();
      return static_cast<T *>(this)->getGatherScatterOpCost(
          Instruction::Store, Args[0]->getType(), Args[1], VarMask, Alignment);
    }
    case Intrinsic::masked_gather: {
      assert(VF == 1 && "Can't vectorize types here.");
      Value *Mask = Args[2];
      bool VarMask = !isa<Constant>(Mask);
      unsigned Alignment = cast<ConstantInt>(Args[1])->getZExtValue();
      return static_cast<T *>(this)->getGatherScatterOpCost(
          Instruction::Load, RetTy, Args[0], VarMask, Alignment);
    }
    }
  }
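  // For reference, the operand positions above follow the masked intrinsic
  // signatures, e.g. (schematically):
  //   declare <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> %ptrs,
  //                                               i32 %align,
  //                                               <4 x i1> %mask,
  //                                               <4 x i32> %passthru)
  // so for masked.gather Args[1] is the alignment and Args[2] is the mask;
  // for masked.scatter the mask is Args[3].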
  /// Get intrinsic cost based on argument types.
  /// If ScalarizationCostPassed is UINT_MAX, the cost of scalarizing the
  /// arguments and the return value will be computed based on types.
  unsigned getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
                                 ArrayRef<Type *> Tys, FastMathFlags FMF,
                                 unsigned ScalarizationCostPassed = UINT_MAX) {
    SmallVector<unsigned, 2> ISDs;
    unsigned SingleCallCost = 10; // Library call cost. Make it expensive.
    switch (IID) {
    default: {
      // Assume that we need to scalarize this intrinsic.
      unsigned ScalarizationCost = ScalarizationCostPassed;
      unsigned ScalarCalls = 1;
      Type *ScalarRetTy = RetTy;
      if (RetTy->isVectorTy()) {
        if (ScalarizationCostPassed == UINT_MAX)
          ScalarizationCost = getScalarizationOverhead(RetTy, true, false);
        ScalarCalls = std::max(ScalarCalls, RetTy->getVectorNumElements());
        ScalarRetTy = RetTy->getScalarType();
      }
      SmallVector<Type *, 4> ScalarTys;
      for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
        Type *Ty = Tys[i];
        if (Ty->isVectorTy()) {
          if (ScalarizationCostPassed == UINT_MAX)
            ScalarizationCost += getScalarizationOverhead(Ty, false, true);
          ScalarCalls = std::max(ScalarCalls, Ty->getVectorNumElements());
          Ty = Ty->getScalarType();
        }
        ScalarTys.push_back(Ty);
      }
      if (ScalarCalls == 1)
        return 1; // Return cost of a scalar intrinsic. Assume it to be cheap.

      unsigned ScalarCost = static_cast<T *>(this)->getIntrinsicInstrCost(
          IID, ScalarRetTy, ScalarTys, FMF);

      return ScalarCalls * ScalarCost + ScalarizationCost;
    }
    // Look for intrinsics that can be lowered directly or turned into a
    // scalar intrinsic call.
    case Intrinsic::sqrt:
      ISDs.push_back(ISD::FSQRT);
      break;
    case Intrinsic::sin:
      ISDs.push_back(ISD::FSIN);
      break;
    case Intrinsic::cos:
      ISDs.push_back(ISD::FCOS);
      break;
    case Intrinsic::exp:
      ISDs.push_back(ISD::FEXP);
      break;
    case Intrinsic::exp2:
      ISDs.push_back(ISD::FEXP2);
      break;
    case Intrinsic::log:
      ISDs.push_back(ISD::FLOG);
      break;
    case Intrinsic::log10:
      ISDs.push_back(ISD::FLOG10);
      break;
    case Intrinsic::log2:
      ISDs.push_back(ISD::FLOG2);
      break;
    case Intrinsic::fabs:
      ISDs.push_back(ISD::FABS);
      break;
    case Intrinsic::minnum:
      ISDs.push_back(ISD::FMINNUM);
      if (FMF.noNaNs())
        ISDs.push_back(ISD::FMINNAN);
      break;
    case Intrinsic::maxnum:
      ISDs.push_back(ISD::FMAXNUM);
      if (FMF.noNaNs())
        ISDs.push_back(ISD::FMAXNAN);
      break;
    case Intrinsic::copysign:
      ISDs.push_back(ISD::FCOPYSIGN);
      break;
    case Intrinsic::floor:
      ISDs.push_back(ISD::FFLOOR);
      break;
    case Intrinsic::ceil:
      ISDs.push_back(ISD::FCEIL);
      break;
    case Intrinsic::trunc:
      ISDs.push_back(ISD::FTRUNC);
      break;
    case Intrinsic::nearbyint:
      ISDs.push_back(ISD::FNEARBYINT);
      break;
    case Intrinsic::rint:
      ISDs.push_back(ISD::FRINT);
      break;
    case Intrinsic::round:
      ISDs.push_back(ISD::FROUND);
      break;
    case Intrinsic::pow:
      ISDs.push_back(ISD::FPOW);
      break;
    case Intrinsic::fma:
      ISDs.push_back(ISD::FMA);
      break;
    case Intrinsic::fmuladd:
      ISDs.push_back(ISD::FMA);
      break;
    // FIXME: We should return 0 whenever getIntrinsicCost == TCC_Free.
    case Intrinsic::lifetime_start:
    case Intrinsic::lifetime_end:
      return 0;
    case Intrinsic::masked_store:
      return static_cast<T *>(this)
          ->getMaskedMemoryOpCost(Instruction::Store, Tys[0], 0, 0);
    case Intrinsic::masked_load:
      return static_cast<T *>(this)
          ->getMaskedMemoryOpCost(Instruction::Load, RetTy, 0, 0);
    case Intrinsic::ctpop:
      ISDs.push_back(ISD::CTPOP);
      // In case of legalization use TCC_Expensive. This is cheaper than a
      // library call but still not a cheap instruction.
      SingleCallCost = TargetTransformInfo::TCC_Expensive;
      break;
    // FIXME: ctlz, cttz, ...
    }

    const TargetLoweringBase *TLI = getTLI();
    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy);

    SmallVector<unsigned, 2> LegalCost;
    SmallVector<unsigned, 2> CustomCost;
    for (unsigned ISD : ISDs) {
      if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
        if (IID == Intrinsic::fabs && TLI->isFAbsFree(LT.second)) {
          return 0;
        }

        // The operation is legal. Assume it costs 1.
        // If the type is split to multiple registers, assume that there is
        // some overhead to this.
        // TODO: Once we have extract/insert subvector cost we need to use them.
        if (LT.first > 1)
          LegalCost.push_back(LT.first * 2);
        else
          LegalCost.push_back(LT.first * 1);
      } else if (!TLI->isOperationExpand(ISD, LT.second)) {
        // If the operation is custom lowered then assume
        // that the code is twice as expensive.
        CustomCost.push_back(LT.first * 2);
      }
    }

    auto MinLegalCostI = std::min_element(LegalCost.begin(), LegalCost.end());
    if (MinLegalCostI != LegalCost.end())
      return *MinLegalCostI;

    auto MinCustomCostI =
        std::min_element(CustomCost.begin(), CustomCost.end());
    if (MinCustomCostI != CustomCost.end())
      return *MinCustomCostI;

    // If we can't lower fmuladd into an FMA, estimate the cost as a floating
    // point mul followed by an add.
    if (IID == Intrinsic::fmuladd)
      return static_cast<T *>(this)
                 ->getArithmeticInstrCost(BinaryOperator::FMul, RetTy) +
             static_cast<T *>(this)
                 ->getArithmeticInstrCost(BinaryOperator::FAdd, RetTy);

    // Else, assume that we need to scalarize this intrinsic. For math builtins
    // this will emit a costly libcall, adding call overhead and spills. Make it
    // very expensive.
    if (RetTy->isVectorTy()) {
      unsigned ScalarizationCost =
          ((ScalarizationCostPassed != UINT_MAX)
               ? ScalarizationCostPassed
               : getScalarizationOverhead(RetTy, true, false));
      unsigned ScalarCalls = RetTy->getVectorNumElements();
      SmallVector<Type *, 4> ScalarTys;
      for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
        Type *Ty = Tys[i];
        if (Ty->isVectorTy())
          Ty = Ty->getScalarType();
        ScalarTys.push_back(Ty);
      }
      unsigned ScalarCost = static_cast<T *>(this)->getIntrinsicInstrCost(
          IID, RetTy->getScalarType(), ScalarTys, FMF);
      for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
        if (Tys[i]->isVectorTy()) {
          if (ScalarizationCostPassed == UINT_MAX)
            ScalarizationCost += getScalarizationOverhead(Tys[i], false, true);
          ScalarCalls = std::max(ScalarCalls, Tys[i]->getVectorNumElements());
        }
      }

      return ScalarCalls * ScalarCost + ScalarizationCost;
    }

    // This is going to be turned into a library call, make it expensive.
    return SingleCallCost;
  }

  /// \brief Compute a cost of the given call instruction.
  ///
  /// Compute the cost of calling function F with return type RetTy and
  /// argument types Tys. F might be nullptr, in which case the cost of an
  /// arbitrary call with the specified signature will be returned.
  /// This is used, for instance, when we estimate a call of a vector
  /// counterpart of the given function.
  /// \param F Called function, might be nullptr.
  /// \param RetTy Return value types.
  /// \param Tys Argument types.
  /// \returns The cost of the Call instruction.
  unsigned getCallInstrCost(Function *F, Type *RetTy, ArrayRef<Type *> Tys) {
    return 10;
  }

  unsigned getNumberOfParts(Type *Tp) {
    std::pair<unsigned, MVT> LT = getTLI()->getTypeLegalizationCost(DL, Tp);
    return LT.first;
  }

  unsigned getAddressComputationCost(Type *Ty, ScalarEvolution *,
                                     const SCEV *) {
    return 0;
  }
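  // Worked example for getIntrinsicInstrCost above (illustrative): for
  // @llvm.sqrt.v4f32 on a target where ISD::FSQRT is legal for v4f32,
  // LT.first == 1 and the cost is 1. If v4f32 had to be split into two
  // registers (LT.first == 2), the model would report 2 * 2 = 4 to account
  // for the split overhead.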
  /// Try to calculate arithmetic and shuffle op costs for reduction operations.
  /// We're assuming that reduction operations are performed the following way:
  /// 1. Non-pairwise reduction:
  /// %val1 = shufflevector<n x t> %val, <n x t> %undef,
  /// <n x i32> <i32 n/2, i32 n/2 + 1, ..., i32 n, i32 undef, ..., i32 undef>
  ///            \----------------v-------------/  \----------v------------/
  ///                            n/2 elements               n/2 elements
  /// %red1 = op <n x t> %val, <n x t> val1
  /// After this operation we have a vector %red1 where only the first n/2
  /// elements are meaningful, the second n/2 elements are undefined and can be
  /// dropped. All other operations are actually working with the vector of
  /// length n/2, not n, though the real vector length is still n.
  /// %val2 = shufflevector<n x t> %red1, <n x t> %undef,
  /// <n x i32> <i32 n/4, i32 n/4 + 1, ..., i32 n/2, i32 undef, ..., i32 undef>
  ///            \----------------v-------------/  \----------v------------/
  ///                            n/4 elements               3*n/4 elements
  /// %red2 = op <n x t> %red1, <n x t> val2  - working with the vector of
  /// length n/2, the resulting vector has length n/4 etc.
  /// 2. Pairwise reduction:
  /// Everything is the same except for an additional shuffle operation which
  /// is used to produce operands for pairwise kind of reductions.
  /// %val1 = shufflevector<n x t> %val, <n x t> %undef,
  /// <n x i32> <i32 0, i32 2, ..., i32 n-2, i32 undef, ..., i32 undef>
  ///            \-------------v----------/  \----------v------------/
  ///                     n/2 elements               n/2 elements
  /// %val2 = shufflevector<n x t> %val, <n x t> %undef,
  /// <n x i32> <i32 1, i32 3, ..., i32 n-1, i32 undef, ..., i32 undef>
  ///            \-------------v----------/  \----------v------------/
  ///                     n/2 elements               n/2 elements
  /// %red1 = op <n x t> %val1, <n x t> val2
  /// Again, the operation is performed on an <n x t> vector, but the resulting
  /// vector %red1 is an <n/2 x t> vector.
  ///
  /// The cost model should take into account that the actual length of the
  /// vector is reduced on each iteration.
  unsigned getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwise) {
    assert(Ty->isVectorTy() && "Expect a vector type");
    Type *ScalarTy = Ty->getVectorElementType();
    unsigned NumVecElts = Ty->getVectorNumElements();
    unsigned NumReduxLevels = Log2_32(NumVecElts);
    unsigned ArithCost = 0;
    unsigned ShuffleCost = 0;
    auto *ConcreteTTI = static_cast<T *>(this);
    std::pair<unsigned, MVT> LT =
        ConcreteTTI->getTLI()->getTypeLegalizationCost(DL, Ty);
    unsigned LongVectorCount = 0;
    unsigned MVTLen =
        LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
    while (NumVecElts > MVTLen) {
      NumVecElts /= 2;
      // Assume the pairwise shuffles add a cost.
      ShuffleCost += (IsPairwise + 1) *
                     ConcreteTTI->getShuffleCost(TTI::SK_ExtractSubvector, Ty,
                                                 NumVecElts, Ty);
      ArithCost += ConcreteTTI->getArithmeticInstrCost(Opcode, Ty);
      Ty = VectorType::get(ScalarTy, NumVecElts);
      ++LongVectorCount;
    }
    // The minimal length of the vector is limited by the real length of vector
    // operations performed on the current platform. That's why several final
    // reduction operations are performed on vectors with the same
    // architecture-dependent length.
    ShuffleCost += (NumReduxLevels - LongVectorCount) * (IsPairwise + 1) *
                   ConcreteTTI->getShuffleCost(TTI::SK_ExtractSubvector, Ty,
                                               NumVecElts, Ty);
    ArithCost += (NumReduxLevels - LongVectorCount) *
                 ConcreteTTI->getArithmeticInstrCost(Opcode, Ty);
    return ShuffleCost + ArithCost + getScalarizationOverhead(Ty, false, true);
  }

  unsigned getVectorSplitCost() { return 1; }

  /// @}
};

/// \brief Concrete BasicTTIImpl that can be used if no further customization
/// is needed.
class BasicTTIImpl : public BasicTTIImplBase<BasicTTIImpl> {
  typedef BasicTTIImplBase<BasicTTIImpl> BaseT;
  friend class BasicTTIImplBase<BasicTTIImpl>;

  const TargetSubtargetInfo *ST;
  const TargetLoweringBase *TLI;

  const TargetSubtargetInfo *getST() const { return ST; }
  const TargetLoweringBase *getTLI() const { return TLI; }

public:
  explicit BasicTTIImpl(const TargetMachine *TM, const Function &F);
};

} // end namespace llvm

#endif