//===- BasicTTIImpl.h -------------------------------------------*- C++ -*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file provides a helper that implements much of the TTI interface in
/// terms of the target-independent code generator and TargetLowering
/// interfaces.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_CODEGEN_BASICTTIIMPL_H
#define LLVM_CODEGEN_BASICTTIIMPL_H

#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/TargetTransformInfoImpl.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/MC/MCSchedule.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <limits>
#include <utility>

namespace llvm {

class Function;
class GlobalValue;
class LLVMContext;
class ScalarEvolution;
class SCEV;
class TargetMachine;

extern cl::opt<unsigned> PartialUnrollingThreshold;

/// \brief Base class which can be used to help build a TTI implementation.
///
/// This class provides as much implementation of the TTI interface as is
/// possible using the target independent parts of the code generator.
///
/// In order to subclass it, your class must implement a getST() method to
/// return the subtarget, and a getTLI() method to return the target lowering.
/// We need these methods implemented in the derived class so that this class
/// doesn't have to duplicate storage for them.
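///
/// A minimal sketch of a derived class (illustrative only; MyTargetTTIImpl
/// and its members are hypothetical and not part of LLVM; constructor and
/// other details are omitted):
/// \code
///   class MyTargetTTIImpl : public BasicTTIImplBase<MyTargetTTIImpl> {
///     friend class BasicTTIImplBase<MyTargetTTIImpl>;
///
///     const TargetSubtargetInfo *ST;   // owned by the target machine
///     const TargetLoweringBase *TLI;   // owned by the subtarget
///
///     const TargetSubtargetInfo *getST() const { return ST; }
///     const TargetLoweringBase *getTLI() const { return TLI; }
///   };
/// \endcode
/// The concrete BasicTTIImpl class at the end of this file follows the same
/// pattern.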
template <typename T>
class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
private:
  using BaseT = TargetTransformInfoImplCRTPBase<T>;
  using TTI = TargetTransformInfo;

  /// Estimate the cost of a shuffle as a sequence of extract and insert
  /// operations.
  unsigned getPermuteShuffleOverhead(Type *Ty) {
    assert(Ty->isVectorTy() && "Can only shuffle vectors");
    unsigned Cost = 0;
    // The shuffle cost is equal to the cost of extracting each element from
    // its source vector plus the cost of inserting it into the result vector.

    // E.g. a <4 x float> shuffle with a mask of <0,5,2,7> needs to extract
    // index 0 of the first vector, index 1 of the second vector, index 2 of
    // the first vector, and finally index 3 of the second vector, and insert
    // them at indices <0,1,2,3> of the result vector.
    for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) {
      Cost += static_cast<T *>(this)
                  ->getVectorInstrCost(Instruction::InsertElement, Ty, i);
      Cost += static_cast<T *>(this)
                  ->getVectorInstrCost(Instruction::ExtractElement, Ty, i);
    }
    return Cost;
  }

  /// \brief Local query method delegates up to T which *must* implement this!
  const TargetSubtargetInfo *getST() const {
    return static_cast<const T *>(this)->getST();
  }

  /// \brief Local query method delegates up to T which *must* implement this!
  const TargetLoweringBase *getTLI() const {
    return static_cast<const T *>(this)->getTLI();
  }

protected:
  explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL)
      : BaseT(DL) {}

  using TargetTransformInfoImplBase::DL;

public:
  /// \name Scalar TTI Implementations
  /// @{
  bool allowsMisalignedMemoryAccesses(LLVMContext &Context,
                                      unsigned BitWidth, unsigned AddressSpace,
                                      unsigned Alignment, bool *Fast) const {
    EVT E = EVT::getIntegerVT(Context, BitWidth);
    return getTLI()->allowsMisalignedMemoryAccesses(E, AddressSpace, Alignment, Fast);
  }

  bool hasBranchDivergence() { return false; }

  bool isSourceOfDivergence(const Value *V) { return false; }

  bool isAlwaysUniform(const Value *V) { return false; }

  unsigned getFlatAddressSpace() {
    // Return an invalid address space.
    return -1;
  }

  bool isLegalAddImmediate(int64_t imm) {
    return getTLI()->isLegalAddImmediate(imm);
  }

  bool isLegalICmpImmediate(int64_t imm) {
    return getTLI()->isLegalICmpImmediate(imm);
  }

  bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
                             bool HasBaseReg, int64_t Scale,
                             unsigned AddrSpace, Instruction *I = nullptr) {
    TargetLoweringBase::AddrMode AM;
    AM.BaseGV = BaseGV;
    AM.BaseOffs = BaseOffset;
    AM.HasBaseReg = HasBaseReg;
    AM.Scale = Scale;
    return getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace, I);
  }

  bool isLSRCostLess(TTI::LSRCost C1, TTI::LSRCost C2) {
    return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
  }

  int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
                           bool HasBaseReg, int64_t Scale, unsigned AddrSpace) {
    TargetLoweringBase::AddrMode AM;
    AM.BaseGV = BaseGV;
    AM.BaseOffs = BaseOffset;
    AM.HasBaseReg = HasBaseReg;
    AM.Scale = Scale;
    return getTLI()->getScalingFactorCost(DL, AM, Ty, AddrSpace);
  }

  bool isTruncateFree(Type *Ty1, Type *Ty2) {
    return getTLI()->isTruncateFree(Ty1, Ty2);
  }

  bool isProfitableToHoist(Instruction *I) {
    return getTLI()->isProfitableToHoist(I);
  }

  bool isTypeLegal(Type *Ty) {
    EVT VT = getTLI()->getValueType(DL, Ty);
    return getTLI()->isTypeLegal(VT);
  }

  int getGEPCost(Type *PointeeType, const Value *Ptr,
                 ArrayRef<const Value *> Operands) {
    return BaseT::getGEPCost(PointeeType, Ptr, Operands);
  }

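  /// \brief Estimate the cost of a scalar extension instruction.
  ///
  /// The extension is considered free if the target reports it as such, or if
  /// it is a zext/sext of a load for which the target has a matching extending
  /// load; e.g., a sext of a loaded i8 is typically folded into a
  /// sign-extending load when the target provides one.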
  int getExtCost(const Instruction *I, const Value *Src) {
    if (getTLI()->isExtFree(I))
      return TargetTransformInfo::TCC_Free;

    if (isa<ZExtInst>(I) || isa<SExtInst>(I))
      if (const LoadInst *LI = dyn_cast<LoadInst>(Src))
        if (getTLI()->isExtLoad(LI, I, DL))
          return TargetTransformInfo::TCC_Free;

    return TargetTransformInfo::TCC_Basic;
  }

  unsigned getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
                            ArrayRef<const Value *> Arguments) {
    return BaseT::getIntrinsicCost(IID, RetTy, Arguments);
  }

  unsigned getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
                            ArrayRef<Type *> ParamTys) {
    if (IID == Intrinsic::cttz) {
      if (getTLI()->isCheapToSpeculateCttz())
        return TargetTransformInfo::TCC_Basic;
      return TargetTransformInfo::TCC_Expensive;
    }

    if (IID == Intrinsic::ctlz) {
      if (getTLI()->isCheapToSpeculateCtlz())
        return TargetTransformInfo::TCC_Basic;
      return TargetTransformInfo::TCC_Expensive;
    }

    return BaseT::getIntrinsicCost(IID, RetTy, ParamTys);
  }

  unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
                                            unsigned &JumpTableSize) {
    /// Try to find the estimated number of clusters. Note that the number of
    /// clusters identified in this function could be different from the actual
    /// numbers found in lowering. This function ignores switches that are
    /// lowered with a mix of jump table / bit test / BTree. This function was
    /// initially intended to be used when estimating the cost of a switch in
    /// the inline cost heuristic, but it's a generic cost model to be used in
    /// other places (e.g., in loop unrolling).
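    ///
    /// For example (illustrative only): a switch whose cases form a dense
    /// range and for which the target allows a jump table is counted as a
    /// single cluster, with JumpTableSize set to the size of that range,
    /// whereas a switch with widely scattered cases that suit neither a bit
    /// test nor a jump table is counted as one cluster per case.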
    unsigned N = SI.getNumCases();
    const TargetLoweringBase *TLI = getTLI();
    const DataLayout &DL = this->getDataLayout();

    JumpTableSize = 0;
    bool IsJTAllowed = TLI->areJTsAllowed(SI.getParent()->getParent());

    // Early exit if both a jump table and bit test are not allowed.
    if (N < 1 || (!IsJTAllowed && DL.getPointerSizeInBits() < N))
      return N;

    APInt MaxCaseVal = SI.case_begin()->getCaseValue()->getValue();
    APInt MinCaseVal = MaxCaseVal;
    for (auto CI : SI.cases()) {
      const APInt &CaseVal = CI.getCaseValue()->getValue();
      if (CaseVal.sgt(MaxCaseVal))
        MaxCaseVal = CaseVal;
      if (CaseVal.slt(MinCaseVal))
        MinCaseVal = CaseVal;
    }

    // Check if suitable for a bit test
    if (N <= DL.getPointerSizeInBits()) {
      SmallPtrSet<const BasicBlock *, 4> Dests;
      for (auto I : SI.cases())
        Dests.insert(I.getCaseSuccessor());

      if (TLI->isSuitableForBitTests(Dests.size(), N, MinCaseVal, MaxCaseVal,
                                     DL))
        return 1;
    }

    // Check if suitable for a jump table.
    if (IsJTAllowed) {
      if (N < 2 || N < TLI->getMinimumJumpTableEntries())
        return N;
      uint64_t Range =
          (MaxCaseVal - MinCaseVal)
              .getLimitedValue(std::numeric_limits<uint64_t>::max() - 1) + 1;
      // Check whether a range of clusters is dense enough for a jump table
      if (TLI->isSuitableForJumpTable(&SI, N, Range)) {
        JumpTableSize = Range;
        return 1;
      }
    }
    return N;
  }

  unsigned getJumpBufAlignment() { return getTLI()->getJumpBufAlignment(); }

  unsigned getJumpBufSize() { return getTLI()->getJumpBufSize(); }

  bool shouldBuildLookupTables() {
    const TargetLoweringBase *TLI = getTLI();
    return TLI->isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) ||
           TLI->isOperationLegalOrCustom(ISD::BRIND, MVT::Other);
  }

  bool haveFastSqrt(Type *Ty) {
    const TargetLoweringBase *TLI = getTLI();
    EVT VT = TLI->getValueType(DL, Ty);
    return TLI->isTypeLegal(VT) &&
           TLI->isOperationLegalOrCustom(ISD::FSQRT, VT);
  }

  unsigned getFPOpCost(Type *Ty) {
    // By default, assume FP instructions are no more expensive than integer
    // ones, since they are implemented in HW. Target-specific TTI can
    // override this.
    return TargetTransformInfo::TCC_Basic;
  }

  unsigned getOperationCost(unsigned Opcode, Type *Ty, Type *OpTy) {
    const TargetLoweringBase *TLI = getTLI();
    switch (Opcode) {
    default: break;
    case Instruction::Trunc:
      if (TLI->isTruncateFree(OpTy, Ty))
        return TargetTransformInfo::TCC_Free;
      return TargetTransformInfo::TCC_Basic;
    case Instruction::ZExt:
      if (TLI->isZExtFree(OpTy, Ty))
        return TargetTransformInfo::TCC_Free;
      return TargetTransformInfo::TCC_Basic;
    }

    return BaseT::getOperationCost(Opcode, Ty, OpTy);
  }

  unsigned getInliningThresholdMultiplier() { return 1; }

  void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                               TTI::UnrollingPreferences &UP) {
    // This unrolling functionality is target independent, but to provide some
    // motivation for its intended use, for x86:

    // According to the Intel 64 and IA-32 Architectures Optimization Reference
    // Manual, Intel Core models and later have a loop stream detector (and
    // associated uop queue) that can benefit from partial unrolling.
    // The relevant requirements are:
    //  - The loop must have no more than 4 (8 for Nehalem and later) branches
    //    taken, and none of them may be calls.
    //  - The loop can have no more than 18 (28 for Nehalem and later) uops.

    // According to the Software Optimization Guide for AMD Family 15h
    // Processors, models 30h-4fh (Steamroller and later) have a loop predictor
    // and loop buffer which can benefit from partial unrolling.
    // The relevant requirements are:
    //  - The loop must have fewer than 16 branches
    //  - The loop must have less than 40 uops in all executed loop branches

    // The number of taken branches in a loop is hard to estimate here, and
    // benchmarking has revealed that it is better not to be conservative when
    // estimating the branch count. As a result, we'll ignore the branch limits
    // until someone finds a case where it matters in practice.

    unsigned MaxOps;
    const TargetSubtargetInfo *ST = getST();
    if (PartialUnrollingThreshold.getNumOccurrences() > 0)
      MaxOps = PartialUnrollingThreshold;
    else if (ST->getSchedModel().LoopMicroOpBufferSize > 0)
      MaxOps = ST->getSchedModel().LoopMicroOpBufferSize;
    else
      return;

    // Scan the loop: don't unroll loops with calls.
    for (Loop::block_iterator I = L->block_begin(), E = L->block_end(); I != E;
         ++I) {
      BasicBlock *BB = *I;

      for (BasicBlock::iterator J = BB->begin(), JE = BB->end(); J != JE; ++J)
        if (isa<CallInst>(J) || isa<InvokeInst>(J)) {
          ImmutableCallSite CS(&*J);
          if (const Function *F = CS.getCalledFunction()) {
            if (!static_cast<T *>(this)->isLoweredToCall(F))
              continue;
          }

          return;
        }
    }

    // Enable runtime and partial unrolling up to the specified size.
    // Enable using trip count upper bound to unroll loops.
    UP.Partial = UP.Runtime = UP.UpperBound = true;
    UP.PartialThreshold = MaxOps;

    // Avoid unrolling when optimizing for size.
    UP.OptSizeThreshold = 0;
    UP.PartialOptSizeThreshold = 0;

    // Set number of instructions optimized when "back edge"
    // becomes "fall through" to default value of 2.
    UP.BEInsns = 2;
  }

  int getInstructionLatency(const Instruction *I) {
    if (isa<LoadInst>(I))
      return getST()->getSchedModel().DefaultLoadLatency;

    return BaseT::getInstructionLatency(I);
  }

  /// @}

  /// \name Vector TTI Implementations
  /// @{

  unsigned getNumberOfRegisters(bool Vector) { return Vector ? 0 : 1; }

  unsigned getRegisterBitWidth(bool Vector) const { return 32; }

  /// Estimate the overhead of scalarizing an instruction. Insert and Extract
  /// are set if the result needs to be inserted and/or extracted from vectors.
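  /// For example (illustrative only), scalarizing an operation that produces
  /// a <4 x float> result, with Insert set and Extract clear, is estimated as
  /// the cost of four insertelement instructions.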
  unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) {
    assert(Ty->isVectorTy() && "Can only scalarize vectors");
    unsigned Cost = 0;

    for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) {
      if (Insert)
        Cost += static_cast<T *>(this)
                    ->getVectorInstrCost(Instruction::InsertElement, Ty, i);
      if (Extract)
        Cost += static_cast<T *>(this)
                    ->getVectorInstrCost(Instruction::ExtractElement, Ty, i);
    }

    return Cost;
  }

  /// Estimate the overhead of scalarizing an instruction's unique
  /// non-constant operands. The types of the arguments are ordinarily
  /// scalar, in which case the costs are multiplied by VF.
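  /// For example (illustrative only), with two unique non-constant scalar i32
  /// operands and VF == 4, the estimate is the cost of extracting all four
  /// lanes from each of the two corresponding <4 x i32> operand vectors.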
  unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
                                            unsigned VF) {
    unsigned Cost = 0;
    SmallPtrSet<const Value*, 4> UniqueOperands;
    for (const Value *A : Args) {
      if (!isa<Constant>(A) && UniqueOperands.insert(A).second) {
        Type *VecTy = nullptr;
        if (A->getType()->isVectorTy()) {
          VecTy = A->getType();
          // If A is a vector operand, VF should be 1 or correspond to A.
          assert((VF == 1 || VF == VecTy->getVectorNumElements()) &&
                 "Vector argument does not match VF");
        }
        else
          VecTy = VectorType::get(A->getType(), VF);

        Cost += getScalarizationOverhead(VecTy, false, true);
      }
    }

    return Cost;
  }

  unsigned getScalarizationOverhead(Type *VecTy, ArrayRef<const Value *> Args) {
    assert(VecTy->isVectorTy());

    unsigned Cost = 0;

    Cost += getScalarizationOverhead(VecTy, true, false);
    if (!Args.empty())
      Cost += getOperandsScalarizationOverhead(Args,
                                               VecTy->getVectorNumElements());
    else
      // When no information on arguments is provided, we add the cost
      // associated with one argument as a heuristic.
      Cost += getScalarizationOverhead(VecTy, false, true);

    return Cost;
  }

  unsigned getMaxInterleaveFactor(unsigned VF) { return 1; }

  unsigned getArithmeticInstrCost(
      unsigned Opcode, Type *Ty,
      TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
      TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
      TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
      TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
      ArrayRef<const Value *> Args = ArrayRef<const Value *>()) {
    // Check if any of the operands are vector operands.
    const TargetLoweringBase *TLI = getTLI();
    int ISD = TLI->InstructionOpcodeToISD(Opcode);
    assert(ISD && "Invalid opcode");

    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);

    bool IsFloat = Ty->isFPOrFPVectorTy();
    // Assume that floating point arithmetic operations cost twice as much as
    // integer operations.
    unsigned OpCost = (IsFloat ? 2 : 1);

    if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
      // The operation is legal. Assume it costs 1.
      // TODO: Once we have extract/insert subvector cost we need to use them.
      return LT.first * OpCost;
    }

    if (!TLI->isOperationExpand(ISD, LT.second)) {
      // If the operation is custom lowered, then assume that the code is twice
      // as expensive.
      return LT.first * 2 * OpCost;
    }

    // Else, assume that we need to scalarize this op.
    // TODO: If one of the types get legalized by splitting, handle this
    // similarly to what getCastInstrCost() does.
    if (Ty->isVectorTy()) {
      unsigned Num = Ty->getVectorNumElements();
      unsigned Cost = static_cast<T *>(this)
                          ->getArithmeticInstrCost(Opcode, Ty->getScalarType());
      // Return the cost of multiple scalar invocation plus the cost of
      // inserting and extracting the values.
      return getScalarizationOverhead(Ty, Args) + Num * Cost;
    }

    // We don't know anything about this scalar instruction.
    return OpCost;
  }

  unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
                          Type *SubTp) {
    if (Kind == TTI::SK_Alternate || Kind == TTI::SK_PermuteTwoSrc ||
        Kind == TTI::SK_PermuteSingleSrc) {
      return getPermuteShuffleOverhead(Tp);
    }
    return 1;
  }

  unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
                            const Instruction *I = nullptr) {
    const TargetLoweringBase *TLI = getTLI();
    int ISD = TLI->InstructionOpcodeToISD(Opcode);
    assert(ISD && "Invalid opcode");
    std::pair<unsigned, MVT> SrcLT = TLI->getTypeLegalizationCost(DL, Src);
    std::pair<unsigned, MVT> DstLT = TLI->getTypeLegalizationCost(DL, Dst);

    // Check for NOOP conversions.
    if (SrcLT.first == DstLT.first &&
        SrcLT.second.getSizeInBits() == DstLT.second.getSizeInBits()) {

      // Bitcasts between types that are legalized to the same type are free.
      if (Opcode == Instruction::BitCast || Opcode == Instruction::Trunc)
        return 0;
    }

    if (Opcode == Instruction::Trunc &&
        TLI->isTruncateFree(SrcLT.second, DstLT.second))
      return 0;

    if (Opcode == Instruction::ZExt &&
        TLI->isZExtFree(SrcLT.second, DstLT.second))
      return 0;

    if (Opcode == Instruction::AddrSpaceCast &&
        TLI->isNoopAddrSpaceCast(Src->getPointerAddressSpace(),
                                 Dst->getPointerAddressSpace()))
      return 0;

    // If this is a zext/sext of a load, return 0 if the corresponding
    // extending load exists on the target.
    if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt) &&
        I && isa<LoadInst>(I->getOperand(0))) {
        EVT ExtVT = EVT::getEVT(Dst);
        EVT LoadVT = EVT::getEVT(Src);
        unsigned LType =
          ((Opcode == Instruction::ZExt) ? ISD::ZEXTLOAD : ISD::SEXTLOAD);
        if (TLI->isLoadExtLegal(LType, ExtVT, LoadVT))
          return 0;
    }

    // If the cast is marked as legal (or promote) then assume low cost.
    if (SrcLT.first == DstLT.first &&
        TLI->isOperationLegalOrPromote(ISD, DstLT.second))
      return 1;

    // Handle scalar conversions.
    if (!Src->isVectorTy() && !Dst->isVectorTy()) {
      // Scalar bitcasts are usually free.
      if (Opcode == Instruction::BitCast)
        return 0;

      // Just check the op cost. If the operation is legal then assume it
      // costs 1.
      if (!TLI->isOperationExpand(ISD, DstLT.second))
        return 1;

      // Assume that illegal scalar instructions are expensive.
      return 4;
    }

    // Check vector-to-vector casts.
    if (Dst->isVectorTy() && Src->isVectorTy()) {
      // If the cast is between same-sized registers, then the check is simple.
      if (SrcLT.first == DstLT.first &&
          SrcLT.second.getSizeInBits() == DstLT.second.getSizeInBits()) {

        // Assume that Zext is done using AND.
        if (Opcode == Instruction::ZExt)
          return 1;

        // Assume that sext is done using SHL and SRA.
        if (Opcode == Instruction::SExt)
          return 2;

        // Just check the op cost. If the operation is legal then assume it
        // costs 1 and multiply by the type-legalization overhead.
        if (!TLI->isOperationExpand(ISD, DstLT.second))
          return SrcLT.first * 1;
      }

      // If we are legalizing by splitting, query the concrete TTI for the cost
      // of casting the original vector twice. We also need to factor in the
      // cost of the split itself. Count that as 1, to be consistent with
      // TLI->getTypeLegalizationCost().
      if ((TLI->getTypeAction(Src->getContext(), TLI->getValueType(DL, Src)) ==
           TargetLowering::TypeSplitVector) ||
          (TLI->getTypeAction(Dst->getContext(), TLI->getValueType(DL, Dst)) ==
           TargetLowering::TypeSplitVector)) {
        Type *SplitDst = VectorType::get(Dst->getVectorElementType(),
                                         Dst->getVectorNumElements() / 2);
        Type *SplitSrc = VectorType::get(Src->getVectorElementType(),
                                         Src->getVectorNumElements() / 2);
        T *TTI = static_cast<T *>(this);
        return TTI->getVectorSplitCost() +
               (2 * TTI->getCastInstrCost(Opcode, SplitDst, SplitSrc, I));
      }

      // In other cases where the source or destination are illegal, assume
      // the operation will get scalarized.
      unsigned Num = Dst->getVectorNumElements();
      unsigned Cost = static_cast<T *>(this)->getCastInstrCost(
          Opcode, Dst->getScalarType(), Src->getScalarType(), I);

      // Return the cost of multiple scalar invocation plus the cost of
      // inserting and extracting the values.
      return getScalarizationOverhead(Dst, true, true) + Num * Cost;
    }

    // We already handled vector-to-vector and scalar-to-scalar conversions.
    // This is where we handle bitcasts between vectors and scalars. We need to
    // assume that the conversion is scalarized in one way or another.
    if (Opcode == Instruction::BitCast)
      // Illegal bitcasts are done by storing and loading from a stack slot.
      return (Src->isVectorTy() ? getScalarizationOverhead(Src, false, true)
                                : 0) +
             (Dst->isVectorTy() ? getScalarizationOverhead(Dst, true, false)
                                : 0);

    llvm_unreachable("Unhandled cast");
  }

  unsigned getExtractWithExtendCost(unsigned Opcode, Type *Dst,
                                    VectorType *VecTy, unsigned Index) {
    return static_cast<T *>(this)->getVectorInstrCost(
               Instruction::ExtractElement, VecTy, Index) +
           static_cast<T *>(this)->getCastInstrCost(Opcode, Dst,
                                                    VecTy->getElementType());
  }

  unsigned getCFInstrCost(unsigned Opcode) {
    // Branches are assumed to be predicted.
    return 0;
  }

  unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
                              const Instruction *I) {
    const TargetLoweringBase *TLI = getTLI();
    int ISD = TLI->InstructionOpcodeToISD(Opcode);
    assert(ISD && "Invalid opcode");

    // Selects on vectors are actually vector selects.
    if (ISD == ISD::SELECT) {
      assert(CondTy && "CondTy must exist");
      if (CondTy->isVectorTy())
        ISD = ISD::VSELECT;
    }
    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);

    if (!(ValTy->isVectorTy() && !LT.second.isVector()) &&
        !TLI->isOperationExpand(ISD, LT.second)) {
      // The operation is legal. Assume it costs 1. Multiply
      // by the type-legalization overhead.
      return LT.first * 1;
    }

    // Otherwise, assume that the compare/select is scalarized.
    // TODO: If one of the types get legalized by splitting, handle this
    // similarly to what getCastInstrCost() does.
    if (ValTy->isVectorTy()) {
      unsigned Num = ValTy->getVectorNumElements();
      if (CondTy)
        CondTy = CondTy->getScalarType();
      unsigned Cost = static_cast<T *>(this)->getCmpSelInstrCost(
          Opcode, ValTy->getScalarType(), CondTy, I);

      // Return the cost of multiple scalar invocation plus the cost of
      // inserting and extracting the values.
      return getScalarizationOverhead(ValTy, true, false) + Num * Cost;
    }

    // Unknown scalar opcode.
    return 1;
  }

  unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
    std::pair<unsigned, MVT> LT =
        getTLI()->getTypeLegalizationCost(DL, Val->getScalarType());

    return LT.first;
  }

  unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
                       unsigned AddressSpace, const Instruction *I = nullptr) {
    assert(!Src->isVoidTy() && "Invalid type");
    std::pair<unsigned, MVT> LT = getTLI()->getTypeLegalizationCost(DL, Src);

    // Assuming that all loads of legal types cost 1.
    unsigned Cost = LT.first;

    if (Src->isVectorTy() &&
        Src->getPrimitiveSizeInBits() < LT.second.getSizeInBits()) {
      // This is a vector load that legalizes to a larger type than the vector
      // itself. Unless the corresponding extending load or truncating store is
      // legal, this will scalarize.
      TargetLowering::LegalizeAction LA = TargetLowering::Expand;
      EVT MemVT = getTLI()->getValueType(DL, Src);
      if (Opcode == Instruction::Store)
        LA = getTLI()->getTruncStoreAction(LT.second, MemVT);
      else
        LA = getTLI()->getLoadExtAction(ISD::EXTLOAD, LT.second, MemVT);

      if (LA != TargetLowering::Legal && LA != TargetLowering::Custom) {
        // This is a vector load/store for some illegal type that is scalarized.
        // We must account for the cost of building or decomposing the vector.
        Cost += getScalarizationOverhead(Src, Opcode != Instruction::Store,
                                         Opcode == Instruction::Store);
      }
    }

    return Cost;
  }

  unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                      unsigned Factor,
                                      ArrayRef<unsigned> Indices,
                                      unsigned Alignment,
                                      unsigned AddressSpace) {
    VectorType *VT = dyn_cast<VectorType>(VecTy);
    assert(VT && "Expect a vector type for interleaved memory op");

    unsigned NumElts = VT->getNumElements();
    assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");

    unsigned NumSubElts = NumElts / Factor;
    VectorType *SubVT = VectorType::get(VT->getElementType(), NumSubElts);

    // First, the cost of the load/store operation.
    unsigned Cost = static_cast<T *>(this)->getMemoryOpCost(
        Opcode, VecTy, Alignment, AddressSpace);

    // Legalize the vector type, and get the legalized and unlegalized type
    // sizes.
    MVT VecTyLT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
    unsigned VecTySize =
        static_cast<T *>(this)->getDataLayout().getTypeStoreSize(VecTy);
    unsigned VecTyLTSize = VecTyLT.getStoreSize();

    // Return the ceiling of dividing A by B.
    auto ceil = [](unsigned A, unsigned B) { return (A + B - 1) / B; };

    // Scale the cost of the memory operation by the fraction of legalized
    // instructions that will actually be used. We shouldn't account for the
    // cost of dead instructions since they will be removed.
    //
    // E.g., An interleaved load of factor 8:
    //       %vec = load <16 x i64>, <16 x i64>* %ptr
    //       %v0 = shufflevector %vec, undef, <0, 8>
    //
    // If <16 x i64> is legalized to 8 v2i64 loads, only 2 of the loads will be
    // used (those corresponding to elements [0:1] and [8:9] of the unlegalized
    // type). The other loads are unused.
    //
    // We only scale the cost of loads since interleaved store groups aren't
    // allowed to have gaps.
    if (Opcode == Instruction::Load && VecTySize > VecTyLTSize) {
      // The number of loads of a legal type it will take to represent a load
      // of the unlegalized vector type.
      unsigned NumLegalInsts = ceil(VecTySize, VecTyLTSize);

      // The number of elements of the unlegalized type that correspond to a
      // single legal instruction.
      unsigned NumEltsPerLegalInst = ceil(NumElts, NumLegalInsts);

      // Determine which legal instructions will be used.
      BitVector UsedInsts(NumLegalInsts, false);
      for (unsigned Index : Indices)
        for (unsigned Elt = 0; Elt < NumSubElts; ++Elt)
          UsedInsts.set((Index + Elt * Factor) / NumEltsPerLegalInst);

      // Scale the cost of the load by the fraction of legal instructions that
      // will be used. Multiply before dividing so that the integer division
      // does not truncate the fraction to zero.
      Cost = Cost * UsedInsts.count() / NumLegalInsts;
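      // In the factor-8 example above, for instance, only 2 of the 8 legal
      // v2i64 loads are used, so the memory cost is scaled by 2/8.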
    }

    // Then add the cost of the interleave operation.
    if (Opcode == Instruction::Load) {
      // The interleave cost is similar to extracting the sub-vectors' elements
      // from the wide vector and inserting them into the sub-vectors.
      //
      // E.g. An interleaved load of factor 2 (with one member of index 0):
      //      %vec = load <8 x i32>, <8 x i32>* %ptr
      //      %v0 = shuffle %vec, undef, <0, 2, 4, 6>         ; Index 0
      // The cost is estimated as extracting elements at 0, 2, 4, 6 from the
      // <8 x i32> vector and inserting them into a <4 x i32> vector.

      assert(Indices.size() <= Factor &&
             "Interleaved memory op has too many members");

      for (unsigned Index : Indices) {
        assert(Index < Factor && "Invalid index for interleaved memory op");

        // Extract elements from loaded vector for each sub vector.
        for (unsigned i = 0; i < NumSubElts; i++)
          Cost += static_cast<T *>(this)->getVectorInstrCost(
              Instruction::ExtractElement, VT, Index + i * Factor);
      }

      unsigned InsSubCost = 0;
      for (unsigned i = 0; i < NumSubElts; i++)
        InsSubCost += static_cast<T *>(this)->getVectorInstrCost(
            Instruction::InsertElement, SubVT, i);

      Cost += Indices.size() * InsSubCost;
    } else {
      // The interleave cost is that of extracting all elements from the
      // sub-vectors and inserting them into the wide vector.
      //
      // E.g. An interleaved store of factor 2:
      //      %v0_v1 = shuffle %v0, %v1, <0, 4, 1, 5, 2, 6, 3, 7>
      //      store <8 x i32> %v0_v1, <8 x i32>* %ptr
      // The cost is estimated as extracting all elements from both <4 x i32>
      // vectors and inserting them into the <8 x i32> vector.

      unsigned ExtSubCost = 0;
      for (unsigned i = 0; i < NumSubElts; i++)
        ExtSubCost += static_cast<T *>(this)->getVectorInstrCost(
            Instruction::ExtractElement, SubVT, i);
      Cost += ExtSubCost * Factor;

      for (unsigned i = 0; i < NumElts; i++)
        Cost += static_cast<T *>(this)
                    ->getVectorInstrCost(Instruction::InsertElement, VT, i);
    }

    return Cost;
  }

  /// Get intrinsic cost based on arguments.
  unsigned getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
                                 ArrayRef<Value *> Args, FastMathFlags FMF,
                                 unsigned VF = 1) {
    unsigned RetVF = (RetTy->isVectorTy() ? RetTy->getVectorNumElements() : 1);
    assert((RetVF == 1 || VF == 1) && "VF > 1 and RetVF is a vector type");

    switch (IID) {
    default: {
      // Assume that we need to scalarize this intrinsic.
      SmallVector<Type *, 4> Types;
      for (Value *Op : Args) {
        Type *OpTy = Op->getType();
        assert(VF == 1 || !OpTy->isVectorTy());
        Types.push_back(VF == 1 ? OpTy : VectorType::get(OpTy, VF));
      }

      if (VF > 1 && !RetTy->isVoidTy())
        RetTy = VectorType::get(RetTy, VF);

      // Compute the scalarization overhead based on Args for a vector
      // intrinsic. A vectorizer will pass a scalar RetTy and VF > 1, while
      // CostModel will pass a vector RetTy and VF is 1.
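      // E.g., the loop vectorizer may query a scalar float RetTy with VF == 4,
      // while a cost-model query over already-vectorized IR may use a
      // <4 x float> RetTy with VF == 1; both describe the same vector call.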
      unsigned ScalarizationCost = std::numeric_limits<unsigned>::max();
      if (RetVF > 1 || VF > 1) {
        ScalarizationCost = 0;
        if (!RetTy->isVoidTy())
          ScalarizationCost += getScalarizationOverhead(RetTy, true, false);
        ScalarizationCost += getOperandsScalarizationOverhead(Args, VF);
      }

      return static_cast<T *>(this)->
        getIntrinsicInstrCost(IID, RetTy, Types, FMF, ScalarizationCost);
    }
    case Intrinsic::masked_scatter: {
      assert(VF == 1 && "Can't vectorize types here.");
      Value *Mask = Args[3];
      bool VarMask = !isa<Constant>(Mask);
      unsigned Alignment = cast<ConstantInt>(Args[2])->getZExtValue();
      return
        static_cast<T *>(this)->getGatherScatterOpCost(Instruction::Store,
                                                       Args[0]->getType(),
                                                       Args[1], VarMask,
                                                       Alignment);
    }
    case Intrinsic::masked_gather: {
      assert(VF == 1 && "Can't vectorize types here.");
      Value *Mask = Args[2];
      bool VarMask = !isa<Constant>(Mask);
      unsigned Alignment = cast<ConstantInt>(Args[1])->getZExtValue();
      return
        static_cast<T *>(this)->getGatherScatterOpCost(Instruction::Load,
                                                       RetTy, Args[0], VarMask,
                                                       Alignment);
    }
    }
  }

  /// Get intrinsic cost based on argument types.
  /// If ScalarizationCostPassed is std::numeric_limits<unsigned>::max(), the
  /// cost of scalarizing the arguments and the return value will be computed
  /// based on types.
  unsigned getIntrinsicInstrCost(
      Intrinsic::ID IID, Type *RetTy, ArrayRef<Type *> Tys, FastMathFlags FMF,
      unsigned ScalarizationCostPassed = std::numeric_limits<unsigned>::max()) {
    SmallVector<unsigned, 2> ISDs;
    unsigned SingleCallCost = 10; // Library call cost. Make it expensive.
    switch (IID) {
    default: {
      // Assume that we need to scalarize this intrinsic.
      unsigned ScalarizationCost = ScalarizationCostPassed;
      unsigned ScalarCalls = 1;
      Type *ScalarRetTy = RetTy;
      if (RetTy->isVectorTy()) {
        if (ScalarizationCostPassed == std::numeric_limits<unsigned>::max())
          ScalarizationCost = getScalarizationOverhead(RetTy, true, false);
        ScalarCalls = std::max(ScalarCalls, RetTy->getVectorNumElements());
        ScalarRetTy = RetTy->getScalarType();
      }
      SmallVector<Type *, 4> ScalarTys;
      for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
        Type *Ty = Tys[i];
        if (Ty->isVectorTy()) {
          if (ScalarizationCostPassed == std::numeric_limits<unsigned>::max())
            ScalarizationCost += getScalarizationOverhead(Ty, false, true);
          ScalarCalls = std::max(ScalarCalls, Ty->getVectorNumElements());
          Ty = Ty->getScalarType();
        }
        ScalarTys.push_back(Ty);
      }
      if (ScalarCalls == 1)
        return 1; // Return cost of a scalar intrinsic. Assume it to be cheap.

      unsigned ScalarCost = static_cast<T *>(this)->getIntrinsicInstrCost(
          IID, ScalarRetTy, ScalarTys, FMF);

      return ScalarCalls * ScalarCost + ScalarizationCost;
    }
    // Look for intrinsics that can be lowered directly or turned into a scalar
    // intrinsic call.
    case Intrinsic::sqrt:
      ISDs.push_back(ISD::FSQRT);
      break;
    case Intrinsic::sin:
      ISDs.push_back(ISD::FSIN);
      break;
    case Intrinsic::cos:
      ISDs.push_back(ISD::FCOS);
      break;
    case Intrinsic::exp:
      ISDs.push_back(ISD::FEXP);
      break;
    case Intrinsic::exp2:
      ISDs.push_back(ISD::FEXP2);
      break;
    case Intrinsic::log:
      ISDs.push_back(ISD::FLOG);
      break;
    case Intrinsic::log10:
      ISDs.push_back(ISD::FLOG10);
      break;
    case Intrinsic::log2:
      ISDs.push_back(ISD::FLOG2);
      break;
    case Intrinsic::fabs:
      ISDs.push_back(ISD::FABS);
      break;
    case Intrinsic::minnum:
      ISDs.push_back(ISD::FMINNUM);
      if (FMF.noNaNs())
        ISDs.push_back(ISD::FMINNAN);
      break;
    case Intrinsic::maxnum:
      ISDs.push_back(ISD::FMAXNUM);
      if (FMF.noNaNs())
        ISDs.push_back(ISD::FMAXNAN);
      break;
    case Intrinsic::copysign:
      ISDs.push_back(ISD::FCOPYSIGN);
      break;
    case Intrinsic::floor:
      ISDs.push_back(ISD::FFLOOR);
      break;
    case Intrinsic::ceil:
      ISDs.push_back(ISD::FCEIL);
      break;
    case Intrinsic::trunc:
      ISDs.push_back(ISD::FTRUNC);
      break;
    case Intrinsic::nearbyint:
      ISDs.push_back(ISD::FNEARBYINT);
      break;
    case Intrinsic::rint:
      ISDs.push_back(ISD::FRINT);
      break;
    case Intrinsic::round:
      ISDs.push_back(ISD::FROUND);
      break;
    case Intrinsic::pow:
      ISDs.push_back(ISD::FPOW);
      break;
    case Intrinsic::fma:
      ISDs.push_back(ISD::FMA);
      break;
    case Intrinsic::fmuladd:
      ISDs.push_back(ISD::FMA);
      break;
    // FIXME: We should return 0 whenever getIntrinsicCost == TCC_Free.
    case Intrinsic::lifetime_start:
    case Intrinsic::lifetime_end:
      return 0;
    case Intrinsic::masked_store:
      return static_cast<T *>(this)
          ->getMaskedMemoryOpCost(Instruction::Store, Tys[0], 0, 0);
    case Intrinsic::masked_load:
      return static_cast<T *>(this)
          ->getMaskedMemoryOpCost(Instruction::Load, RetTy, 0, 0);
    case Intrinsic::ctpop:
      ISDs.push_back(ISD::CTPOP);
      // In case of legalization use TCC_Expensive. This is cheaper than a
      // library call but still not a cheap instruction.
      SingleCallCost = TargetTransformInfo::TCC_Expensive;
      break;
    // FIXME: ctlz, cttz, ...
    }

    const TargetLoweringBase *TLI = getTLI();
    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy);

    SmallVector<unsigned, 2> LegalCost;
    SmallVector<unsigned, 2> CustomCost;
    for (unsigned ISD : ISDs) {
      if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
        if (IID == Intrinsic::fabs && TLI->isFAbsFree(LT.second)) {
          return 0;
        }

        // The operation is legal. Assume it costs 1.
        // If the type is split to multiple registers, assume that there is some
        // overhead to this.
        // TODO: Once we have extract/insert subvector cost we need to use them.
        if (LT.first > 1)
          LegalCost.push_back(LT.first * 2);
        else
          LegalCost.push_back(LT.first * 1);
      } else if (!TLI->isOperationExpand(ISD, LT.second)) {
        // If the operation is custom lowered then assume
        // that the code is twice as expensive.
        CustomCost.push_back(LT.first * 2);
      }
    }

    auto MinLegalCostI = std::min_element(LegalCost.begin(), LegalCost.end());
    if (MinLegalCostI != LegalCost.end())
      return *MinLegalCostI;

    auto MinCustomCostI = std::min_element(CustomCost.begin(), CustomCost.end());
    if (MinCustomCostI != CustomCost.end())
      return *MinCustomCostI;

    // If we can't lower fmuladd into an FMA estimate the cost as a floating
    // point mul followed by an add.
    if (IID == Intrinsic::fmuladd)
      return static_cast<T *>(this)
                 ->getArithmeticInstrCost(BinaryOperator::FMul, RetTy) +
             static_cast<T *>(this)
                 ->getArithmeticInstrCost(BinaryOperator::FAdd, RetTy);

    // Else, assume that we need to scalarize this intrinsic. For math builtins
    // this will emit a costly libcall, adding call overhead and spills. Make it
    // very expensive.
    if (RetTy->isVectorTy()) {
      unsigned ScalarizationCost =
          ((ScalarizationCostPassed != std::numeric_limits<unsigned>::max())
               ? ScalarizationCostPassed
               : getScalarizationOverhead(RetTy, true, false));
      unsigned ScalarCalls = RetTy->getVectorNumElements();
      SmallVector<Type *, 4> ScalarTys;
      for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
        Type *Ty = Tys[i];
        if (Ty->isVectorTy())
          Ty = Ty->getScalarType();
        ScalarTys.push_back(Ty);
      }
      unsigned ScalarCost = static_cast<T *>(this)->getIntrinsicInstrCost(
          IID, RetTy->getScalarType(), ScalarTys, FMF);
      for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
        if (Tys[i]->isVectorTy()) {
          if (ScalarizationCostPassed == std::numeric_limits<unsigned>::max())
            ScalarizationCost += getScalarizationOverhead(Tys[i], false, true);
          ScalarCalls = std::max(ScalarCalls, Tys[i]->getVectorNumElements());
        }
      }

      return ScalarCalls * ScalarCost + ScalarizationCost;
    }

    // This is going to be turned into a library call, make it expensive.
    return SingleCallCost;
  }

  /// \brief Compute the cost of the given call instruction.
  ///
  /// Compute the cost of calling function F with return type RetTy and
  /// argument types Tys. F might be nullptr; in this case the cost of an
  /// arbitrary call with the specified signature will be returned.
  /// This is used, for instance, when we estimate the cost of the vector
  /// counterpart of the given function.
  /// \param F Called function, might be nullptr.
  /// \param RetTy Return value type.
  /// \param Tys Argument types.
  /// \returns The cost of the call instruction.
  unsigned getCallInstrCost(Function *F, Type *RetTy, ArrayRef<Type *> Tys) {
    return 10;
  }

  unsigned getNumberOfParts(Type *Tp) {
    std::pair<unsigned, MVT> LT = getTLI()->getTypeLegalizationCost(DL, Tp);
    return LT.first;
  }

  unsigned getAddressComputationCost(Type *Ty, ScalarEvolution *,
                                     const SCEV *) {
    return 0;
  }

  /// Try to calculate arithmetic and shuffle op costs for reduction operations.
  /// We assume that reduction operations are performed in the following way:
  /// 1. Non-pairwise reduction
  /// %val1 = shufflevector<n x t> %val, <n x t> %undef,
  /// <n x i32> <i32 n/2, i32 n/2 + 1, ..., i32 n, i32 undef, ..., i32 undef>
  ///            \----------------v-------------/  \----------v------------/
  ///                            n/2 elements               n/2 elements
  /// %red1 = op <n x t> %val, <n x t> %val1
  /// After this operation we have a vector %red1 where only the first n/2
  /// elements are meaningful; the second n/2 elements are undefined and can be
  /// dropped. All other operations are actually working with the vector of
  /// length n/2, not n, though the real vector length is still n.
  /// %val2 = shufflevector<n x t> %red1, <n x t> %undef,
  /// <n x i32> <i32 n/4, i32 n/4 + 1, ..., i32 n/2, i32 undef, ..., i32 undef>
  ///            \----------------v-------------/  \----------v------------/
  ///                            n/4 elements               3*n/4 elements
  /// %red2 = op <n x t> %red1, <n x t> %val2  - working with the vector of
  /// length n/2, the resulting vector has length n/4 etc.
  /// 2. Pairwise reduction:
  /// Everything is the same except for an additional shuffle operation which
  /// is used to produce operands for the pairwise kind of reduction.
  /// %val1 = shufflevector<n x t> %val, <n x t> %undef,
  /// <n x i32> <i32 0, i32 2, ..., i32 n-2, i32 undef, ..., i32 undef>
  ///            \-------------v----------/  \----------v------------/
  ///                   n/2 elements               n/2 elements
  /// %val2 = shufflevector<n x t> %val, <n x t> %undef,
  /// <n x i32> <i32 1, i32 3, ..., i32 n-1, i32 undef, ..., i32 undef>
  ///            \-------------v----------/  \----------v------------/
  ///                   n/2 elements               n/2 elements
  /// %red1 = op <n x t> %val1, <n x t> %val2
  /// Again, the operation is performed on an <n x t> vector, but the resulting
  /// vector %red1 is an <n/2 x t> vector.
  ///
  /// The cost model should take into account that the actual length of the
  /// vector is reduced on each iteration.
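  ///
  /// For example (illustrative only), a non-pairwise reduction of <8 x i32>
  /// on a target whose widest legal integer vector is <4 x i32> is modeled as
  /// one shuffle + op level at the original length, two further shuffle + op
  /// levels at the legal length, and finally the cost of extracting the
  /// elements of the remaining legal-length vector.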
  unsigned getArithmeticReductionCost(unsigned Opcode, Type *Ty,
                                      bool IsPairwise) {
    assert(Ty->isVectorTy() && "Expect a vector type");
    Type *ScalarTy = Ty->getVectorElementType();
    unsigned NumVecElts = Ty->getVectorNumElements();
    unsigned NumReduxLevels = Log2_32(NumVecElts);
    unsigned ArithCost = 0;
    unsigned ShuffleCost = 0;
    auto *ConcreteTTI = static_cast<T *>(this);
    std::pair<unsigned, MVT> LT =
        ConcreteTTI->getTLI()->getTypeLegalizationCost(DL, Ty);
    unsigned LongVectorCount = 0;
    unsigned MVTLen =
        LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
    while (NumVecElts > MVTLen) {
      NumVecElts /= 2;
      // Assume the pairwise shuffles add a cost.
      ShuffleCost += (IsPairwise + 1) *
                     ConcreteTTI->getShuffleCost(TTI::SK_ExtractSubvector, Ty,
                                                 NumVecElts, Ty);
      ArithCost += ConcreteTTI->getArithmeticInstrCost(Opcode, Ty);
      Ty = VectorType::get(ScalarTy, NumVecElts);
      ++LongVectorCount;
    }
    // The minimal length of the vector is limited by the real length of vector
    // operations performed on the current platform. That's why several final
    // reduction operations are performed on the vectors with the same
    // architecture-dependent length.
    ShuffleCost += (NumReduxLevels - LongVectorCount) * (IsPairwise + 1) *
                   ConcreteTTI->getShuffleCost(TTI::SK_ExtractSubvector, Ty,
                                               NumVecElts, Ty);
    ArithCost += (NumReduxLevels - LongVectorCount) *
                 ConcreteTTI->getArithmeticInstrCost(Opcode, Ty);
    return ShuffleCost + ArithCost + getScalarizationOverhead(Ty, false, true);
  }

  /// Try to calculate op costs for min/max reduction operations.
  /// \param CondTy Conditional type for the Select instruction.
  unsigned getMinMaxReductionCost(Type *Ty, Type *CondTy, bool IsPairwise,
                                  bool) {
    assert(Ty->isVectorTy() && "Expect a vector type");
    Type *ScalarTy = Ty->getVectorElementType();
    Type *ScalarCondTy = CondTy->getVectorElementType();
    unsigned NumVecElts = Ty->getVectorNumElements();
    unsigned NumReduxLevels = Log2_32(NumVecElts);
    unsigned CmpOpcode;
    if (Ty->isFPOrFPVectorTy()) {
      CmpOpcode = Instruction::FCmp;
    } else {
      assert(Ty->isIntOrIntVectorTy() &&
             "expecting floating point or integer type for min/max reduction");
      CmpOpcode = Instruction::ICmp;
    }
    unsigned MinMaxCost = 0;
    unsigned ShuffleCost = 0;
    auto *ConcreteTTI = static_cast<T *>(this);
    std::pair<unsigned, MVT> LT =
        ConcreteTTI->getTLI()->getTypeLegalizationCost(DL, Ty);
    unsigned LongVectorCount = 0;
    unsigned MVTLen =
        LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
    while (NumVecElts > MVTLen) {
      NumVecElts /= 2;
      // Assume the pairwise shuffles add a cost.
      ShuffleCost += (IsPairwise + 1) *
                     ConcreteTTI->getShuffleCost(TTI::SK_ExtractSubvector, Ty,
                                                 NumVecElts, Ty);
      MinMaxCost +=
          ConcreteTTI->getCmpSelInstrCost(CmpOpcode, Ty, CondTy, nullptr) +
          ConcreteTTI->getCmpSelInstrCost(Instruction::Select, Ty, CondTy,
                                          nullptr);
      Ty = VectorType::get(ScalarTy, NumVecElts);
      CondTy = VectorType::get(ScalarCondTy, NumVecElts);
      ++LongVectorCount;
    }
    // The minimal length of the vector is limited by the real length of vector
    // operations performed on the current platform. That's why several final
    // reduction operations are performed on the vectors with the same
    // architecture-dependent length.
    ShuffleCost += (NumReduxLevels - LongVectorCount) * (IsPairwise + 1) *
                   ConcreteTTI->getShuffleCost(TTI::SK_ExtractSubvector, Ty,
                                               NumVecElts, Ty);
    MinMaxCost +=
        (NumReduxLevels - LongVectorCount) *
        (ConcreteTTI->getCmpSelInstrCost(CmpOpcode, Ty, CondTy, nullptr) +
         ConcreteTTI->getCmpSelInstrCost(Instruction::Select, Ty, CondTy,
                                         nullptr));
    // Need 3 extractelement instructions for scalarization + an additional
    // scalar select instruction.
    return ShuffleCost + MinMaxCost +
           3 * getScalarizationOverhead(Ty, /*Insert=*/false,
                                        /*Extract=*/true) +
           ConcreteTTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,
                                           ScalarCondTy, nullptr);
  }

  unsigned getVectorSplitCost() { return 1; }

  /// @}
};

/// \brief Concrete BasicTTIImpl that can be used if no further customization
/// is needed.
class BasicTTIImpl : public BasicTTIImplBase<BasicTTIImpl> {
  using BaseT = BasicTTIImplBase<BasicTTIImpl>;

  friend class BasicTTIImplBase<BasicTTIImpl>;

  const TargetSubtargetInfo *ST;
  const TargetLoweringBase *TLI;

  const TargetSubtargetInfo *getST() const { return ST; }
  const TargetLoweringBase *getTLI() const { return TLI; }

public:
  explicit BasicTTIImpl(const TargetMachine *TM, const Function &F);
};

} // end namespace llvm

#endif // LLVM_CODEGEN_BASICTTIIMPL_H