AArch64TargetTransformInfo.cpp revision dce4a407a24b04eebc6a376f8e62b41aaa7b071f
//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI pass --------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements a TargetTransformInfo analysis pass specific to the
/// AArch64 target machine. It uses the target's detailed information to provide
/// more precise answers to certain TTI queries, while letting the target
/// independent and default TTI implementations handle the rest.
///
//===----------------------------------------------------------------------===//

#include "AArch64.h"
#include "AArch64TargetMachine.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/CostTable.h"
#include "llvm/Target/TargetLowering.h"
#include <algorithm>
using namespace llvm;

#define DEBUG_TYPE "aarch64tti"

// Declare the pass initialization routine locally as target-specific passes
// don't have a target-wide initialization entry point, and so we rely on the
// pass constructor initialization.
namespace llvm {
void initializeAArch64TTIPass(PassRegistry &);
}

namespace {

class AArch64TTI final : public ImmutablePass, public TargetTransformInfo {
  const AArch64TargetMachine *TM;
  const AArch64Subtarget *ST;
  const AArch64TargetLowering *TLI;

  /// Estimate the overhead of scalarizing an instruction. Insert and Extract
  /// are set if the result needs to be inserted and/or extracted from vectors.
  unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;

public:
  AArch64TTI() : ImmutablePass(ID), TM(nullptr), ST(nullptr), TLI(nullptr) {
    llvm_unreachable("This pass cannot be directly constructed");
  }

  AArch64TTI(const AArch64TargetMachine *TM)
      : ImmutablePass(ID), TM(TM), ST(TM->getSubtargetImpl()),
        TLI(TM->getTargetLowering()) {
    initializeAArch64TTIPass(*PassRegistry::getPassRegistry());
  }

  void initializePass() override { pushTTIStack(this); }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    TargetTransformInfo::getAnalysisUsage(AU);
  }

  /// Pass identification.
  static char ID;

  /// Provide necessary pointer adjustments for the two base classes.
  void *getAdjustedAnalysisPointer(const void *ID) override {
    if (ID == &TargetTransformInfo::ID)
      return (TargetTransformInfo *)this;
    return this;
  }

  /// \name Scalar TTI Implementations
  /// @{
  unsigned getIntImmCost(int64_t Val) const;
  unsigned getIntImmCost(const APInt &Imm, Type *Ty) const override;
  unsigned getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
                         Type *Ty) const override;
  unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
                         Type *Ty) const override;
  PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override;

  /// @}

  /// \name Vector TTI Implementations
  /// @{

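  // AArch64 has 31 general-purpose registers (X0-X30) and, when NEON is
  // available, 32 vector registers that are each 128 bits wide.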
  unsigned getNumberOfRegisters(bool Vector) const override {
    if (Vector) {
      if (ST->hasNEON())
        return 32;
      return 0;
    }
    return 31;
  }

  unsigned getRegisterBitWidth(bool Vector) const override {
    if (Vector) {
      if (ST->hasNEON())
        return 128;
      return 0;
    }
    return 64;
  }

  unsigned getMaximumUnrollFactor() const override { return 2; }

  unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const
      override;

  unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) const
      override;

  unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty,
                                  OperandValueKind Opd1Info = OK_AnyValue,
                                  OperandValueKind Opd2Info = OK_AnyValue) const
      override;

  unsigned getAddressComputationCost(Type *Ty, bool IsComplex) const override;

  unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) const
      override;

  unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
                           unsigned AddressSpace) const override;
  /// @}
};

} // end anonymous namespace

INITIALIZE_AG_PASS(AArch64TTI, TargetTransformInfo, "aarch64tti",
                   "AArch64 Target Transform Info", true, true, false)
char AArch64TTI::ID = 0;

ImmutablePass *
llvm::createAArch64TargetTransformInfoPass(const AArch64TargetMachine *TM) {
  return new AArch64TTI(TM);
}

/// \brief Calculate the cost of materializing a 64-bit value. This helper
/// method might only calculate a fraction of a larger immediate. Therefore it
/// is valid to return a cost of ZERO.
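///
/// For example, materializing 0x123456789ABCDEF0 needs all four 16-bit chunks
/// (a MOVZ plus three MOVKs), so the formula below returns
/// (64 - 3 + 15) / 16 == 4.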
unsigned AArch64TTI::getIntImmCost(int64_t Val) const {
  // Check if the immediate can be encoded within an instruction.
  if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
    return 0;

  if (Val < 0)
    Val = ~Val;

  // Calculate how many moves we will need to materialize this constant.
  unsigned LZ = countLeadingZeros((uint64_t)Val);
  return (64 - LZ + 15) / 16;
}

/// \brief Calculate the cost of materializing the given constant.
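///
/// The constant is sign-extended to a multiple of 64 bits, split into 64-bit
/// chunks, and the per-chunk costs are summed, with a minimum of one
/// instruction; an i128 immediate, for example, is costed as its two 64-bit
/// halves.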
unsigned AArch64TTI::getIntImmCost(const APInt &Imm, Type *Ty) const {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  // Sign-extend all constants to a multiple of 64 bits.
  APInt ImmVal = Imm;
  if (BitSize & 0x3f)
    ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);

  // Split the constant into 64-bit chunks and calculate the cost for each
  // chunk.
  unsigned Cost = 0;
  for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
    APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
    int64_t Val = Tmp.getSExtValue();
    Cost += getIntImmCost(Val);
  }
  // We need at least one instruction to materialize the constant.
  return std::max(1U, Cost);
}

unsigned AArch64TTI::getIntImmCost(unsigned Opcode, unsigned Idx,
                                 const APInt &Imm, Type *Ty) const {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TCC_Free;

  unsigned ImmIdx = ~0U;
  switch (Opcode) {
  default:
    return TCC_Free;
  case Instruction::GetElementPtr:
    // Always hoist the base address of a GetElementPtr.
    if (Idx == 0)
      return 2 * TCC_Basic;
    return TCC_Free;
  case Instruction::Store:
    ImmIdx = 0;
    break;
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::ICmp:
    ImmIdx = 1;
    break;
  // Always return TCC_Free for the shift value of a shift instruction.
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    if (Idx == 1)
      return TCC_Free;
    break;
  case Instruction::Trunc:
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::IntToPtr:
  case Instruction::PtrToInt:
  case Instruction::BitCast:
  case Instruction::PHI:
  case Instruction::Call:
  case Instruction::Select:
  case Instruction::Ret:
  case Instruction::Load:
    break;
  }

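  // If this is the operand position that can hold an immediate and the
  // constant is cheap to materialize (at most one instruction per 64-bit
  // chunk), report it as free so constant hoisting leaves it in place;
  // otherwise return the real cost so it becomes a hoisting candidate.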
  if (Idx == ImmIdx) {
    unsigned NumConstants = (BitSize + 63) / 64;
    unsigned Cost = AArch64TTI::getIntImmCost(Imm, Ty);
    return (Cost <= NumConstants * TCC_Basic)
      ? static_cast<unsigned>(TCC_Free) : Cost;
  }
  return AArch64TTI::getIntImmCost(Imm, Ty);
}

unsigned AArch64TTI::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
                                 const APInt &Imm, Type *Ty) const {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TCC_Free;

  switch (IID) {
  default:
    return TCC_Free;
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
  case Intrinsic::smul_with_overflow:
  case Intrinsic::umul_with_overflow:
    if (Idx == 1) {
      unsigned NumConstants = (BitSize + 63) / 64;
      unsigned Cost = AArch64TTI::getIntImmCost(Imm, Ty);
      return (Cost <= NumConstants * TCC_Basic)
        ? static_cast<unsigned>(TCC_Free) : Cost;
    }
    break;
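  // Stackmap and patchpoint immediates do not need to be materialized in
  // registers: the fixed leading operands (Idx < 2 for stackmaps, Idx < 4 for
  // patchpoints) and any constant that fits in 64 bits are free, so constant
  // hoisting should not touch them.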
  case Intrinsic::experimental_stackmap:
    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TCC_Free;
    break;
  case Intrinsic::experimental_patchpoint_void:
  case Intrinsic::experimental_patchpoint_i64:
    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TCC_Free;
    break;
  }
  return AArch64TTI::getIntImmCost(Imm, Ty);
}

AArch64TTI::PopcntSupportKind
AArch64TTI::getPopcntSupport(unsigned TyWidth) const {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  if (TyWidth == 32 || TyWidth == 64)
    return PSK_FastHardware;
  // TODO: AArch64TargetLowering::LowerCTPOP() supports 128-bit popcount.
  return PSK_Software;
}

unsigned AArch64TTI::getCastInstrCost(unsigned Opcode, Type *Dst,
                                    Type *Src) const {
  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  EVT SrcTy = TLI->getValueType(Src);
  EVT DstTy = TLI->getValueType(Dst);

  if (!SrcTy.isSimple() || !DstTy.isSimple())
    return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);

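  // Each table entry gives the approximate number of instructions that the
  // listed vector int<->fp conversion expands to; conversions not listed fall
  // back to the generic TargetTransformInfo cost at the end of this function.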
  static const TypeConversionCostTblEntry<MVT> ConversionTbl[] = {
    // LowerVectorINT_TO_FP:
    { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 1 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 1 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 1 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 1 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
    // LowerVectorFP_TO_INT
    { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
    { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
    { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
    { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
    { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 1 },
    { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 1 },
    { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 4 },
    { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 4 },
    { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 4 },
    { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 4 },
    { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 4 },
    { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 4 },
  };

  int Idx = ConvertCostTableLookup<MVT>(
      ConversionTbl, array_lengthof(ConversionTbl), ISD, DstTy.getSimpleVT(),
      SrcTy.getSimpleVT());
  if (Idx != -1)
    return ConversionTbl[Idx].Cost;

  return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);
}

unsigned AArch64TTI::getVectorInstrCost(unsigned Opcode, Type *Val,
                                      unsigned Index) const {
  assert(Val->isVectorTy() && "This must be a vector type");

  if (Index != -1U) {
    // Legalize the type.
    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Val);

    // This type is legalized to a scalar type.
    if (!LT.second.isVector())
      return 0;

    // The type may be split. Normalize the index to the new type.
    unsigned Width = LT.second.getVectorNumElements();
    Index = Index % Width;

    // The element at index zero is already inside the vector.
    if (Index == 0)
      return 0;
  }

  // All other insert/extracts cost this much.
  return 2;
}

unsigned AArch64TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
                                          OperandValueKind Opd1Info,
                                          OperandValueKind Opd2Info) const {
  // Legalize the type.
  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);

  switch (ISD) {
  default:
    return TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty, Opd1Info,
                                                       Opd2Info);
  case ISD::ADD:
  case ISD::MUL:
  case ISD::XOR:
  case ISD::OR:
  case ISD::AND:
    // These nodes are marked as 'custom' for combining purposes only.
    // We know that they are legal. See LowerAdd in ISelLowering.
    return 1 * LT.first;
  }
}

unsigned AArch64TTI::getAddressComputationCost(Type *Ty, bool IsComplex) const {
  // Address computations in vectorized code with non-consecutive addresses will
  // likely result in more instructions compared to scalar code where the
  // computation can more often be merged into the index mode. The resulting
  // extra micro-ops can significantly decrease throughput.
  unsigned NumVectorInstToHideOverhead = 10;

  if (Ty->isVectorTy() && IsComplex)
    return NumVectorInstToHideOverhead;

  // In many cases the address computation is not merged into the instruction
  // addressing mode.
  return 1;
}

unsigned AArch64TTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                                      Type *CondTy) const {

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  // We don't lower vector selects that are wider than the register width well.
  if (ValTy->isVectorTy() && ISD == ISD::SELECT) {
    // We would need this many instructions to hide the scalarization happening.
    unsigned AmortizationCost = 20;
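    // Each entry below charges AmortizationCost per element of the result
    // vector, i.e. the scalarization cost of a select whose result does not
    // fit in a single 128-bit register.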
    static const TypeConversionCostTblEntry<MVT::SimpleValueType>
    VectorSelectTbl[] = {
      { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 * AmortizationCost },
      { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 * AmortizationCost },
      { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 * AmortizationCost },
      { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
      { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
      { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
    };

    EVT SelCondTy = TLI->getValueType(CondTy);
    EVT SelValTy = TLI->getValueType(ValTy);
    if (SelCondTy.isSimple() && SelValTy.isSimple()) {
      int Idx =
          ConvertCostTableLookup(VectorSelectTbl, ISD, SelCondTy.getSimpleVT(),
                                 SelValTy.getSimpleVT());
      if (Idx != -1)
        return VectorSelectTbl[Idx].Cost;
    }
  }
  return TargetTransformInfo::getCmpSelInstrCost(Opcode, ValTy, CondTy);
}

unsigned AArch64TTI::getMemoryOpCost(unsigned Opcode, Type *Src,
                                   unsigned Alignment,
                                   unsigned AddressSpace) const {
  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src);

  if (Opcode == Instruction::Store && Src->isVectorTy() && Alignment != 16 &&
      Src->getVectorElementType()->isIntegerTy(64)) {
    // Unaligned stores are extremely inefficient. We don't split unaligned
    // v2i64 stores because of the negative impact that has been seen in
    // practice on inlined memcpy code.
    // We make v2i64 stores expensive so that we will only vectorize if there
    // are 6 other instructions getting vectorized.
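    // For example, a single unaligned v2i64 store (LT.first == 1) is reported
    // with cost 1 * 2 * 6 == 12 instead of the default LT.first == 1.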
    unsigned AmortizationCost = 6;

    return LT.first * 2 * AmortizationCost;
  }

  if (Src->isVectorTy() && Src->getVectorElementType()->isIntegerTy(8) &&
      Src->getVectorNumElements() < 8) {
    // We scalarize the loads/stores because there is no v.4b register and we
    // have to promote the elements to v.4h.
    unsigned NumVecElts = Src->getVectorNumElements();
    unsigned NumVectorizableInstsToAmortize = NumVecElts * 2;
    // We generate 2 instructions per vector element.
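    // For a v4i8 access, for example, this is (4 * 2) * 4 * 2 == 64.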
    return NumVectorizableInstsToAmortize * NumVecElts * 2;
  }

  return LT.first;
}