AArch64TargetTransformInfo.cpp revision dce4a407a24b04eebc6a376f8e62b41aaa7b071f
//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI pass --------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements a TargetTransformInfo analysis pass specific to the
/// AArch64 target machine. It uses the target's detailed information to provide
/// more precise answers to certain TTI queries, while letting the target
/// independent and default TTI implementations handle the rest.
///
//===----------------------------------------------------------------------===//

#include "AArch64.h"
#include "AArch64TargetMachine.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/CostTable.h"
#include "llvm/Target/TargetLowering.h"
#include <algorithm>
using namespace llvm;

#define DEBUG_TYPE "aarch64tti"

// Declare the pass initialization routine locally as target-specific passes
// don't have a target-wide initialization entry point, and so we rely on the
// pass constructor initialization.
namespace llvm {
void initializeAArch64TTIPass(PassRegistry &);
}

namespace {

/// \brief AArch64-specific implementation of the TargetTransformInfo analysis.
///
/// Registered into the TargetTransformInfo analysis group via
/// INITIALIZE_AG_PASS below; it answers the cost queries it overrides using
/// subtarget/lowering details and defers everything else to the base
/// TargetTransformInfo implementation.
class AArch64TTI final : public ImmutablePass, public TargetTransformInfo {
  // Owning target machine; null only for the deleted-by-convention default
  // constructor path.
  const AArch64TargetMachine *TM;
  // Subtarget feature information (e.g. NEON availability, queried below).
  const AArch64Subtarget *ST;
  // Target lowering, used for type legalization costs and opcode mapping.
  const AArch64TargetLowering *TLI;

  /// Estimate the overhead of scalarizing an instruction. Insert and Extract
  /// are set if the result needs to be inserted and/or extracted from vectors.
  unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;

public:
  /// Default constructor exists only to satisfy pass machinery; it must never
  /// actually be used to build the pass.
  AArch64TTI() : ImmutablePass(ID), TM(nullptr), ST(nullptr), TLI(nullptr) {
    llvm_unreachable("This pass cannot be directly constructed");
  }

  AArch64TTI(const AArch64TargetMachine *TM)
      : ImmutablePass(ID), TM(TM), ST(TM->getSubtargetImpl()),
        TLI(TM->getTargetLowering()) {
    initializeAArch64TTIPass(*PassRegistry::getPassRegistry());
  }

  /// Push this implementation onto the TTI analysis-group delegation stack.
  void initializePass() override { pushTTIStack(this); }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    TargetTransformInfo::getAnalysisUsage(AU);
  }

  /// Pass identification.
  static char ID;

  /// Provide necessary pointer adjustments for the two base classes.
  void *getAdjustedAnalysisPointer(const void *ID) override {
    if (ID == &TargetTransformInfo::ID)
      return (TargetTransformInfo *)this;
    return this;
  }

  /// \name Scalar TTI Implementations
  /// @{
  unsigned getIntImmCost(int64_t Val) const;
  unsigned getIntImmCost(const APInt &Imm, Type *Ty) const override;
  unsigned getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
                         Type *Ty) const override;
  unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
                         Type *Ty) const override;
  PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override;

  /// @}

  /// \name Vector TTI Implementations
  /// @{

  unsigned getNumberOfRegisters(bool Vector) const override {
    if (Vector) {
      if (ST->hasNEON())
        return 32;
      // No NEON means no vector register file is available.
      return 0;
    }
    // Scalar case: 31 registers. NOTE(review): presumably the 31 allocatable
    // general-purpose registers X0-X30 — confirm against the register info.
    return 31;
  }

  unsigned getRegisterBitWidth(bool Vector) const override {
    if (Vector) {
      if (ST->hasNEON())
        return 128;
      return 0;
    }
    return 64;
  }

  unsigned getMaximumUnrollFactor() const override { return 2; }

  unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const
      override;

  unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) const
      override;

  unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty,
                                  OperandValueKind Opd1Info = OK_AnyValue,
                                  OperandValueKind Opd2Info = OK_AnyValue) const
      override;

  unsigned getAddressComputationCost(Type *Ty, bool IsComplex) const override;

  unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) const
      override;

  unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
                           unsigned AddressSpace) const override;
  /// @}
};

} // end anonymous namespace

INITIALIZE_AG_PASS(AArch64TTI, TargetTransformInfo, "aarch64tti",
                   "AArch64 Target Transform Info", true, true, false)
char AArch64TTI::ID = 0;

ImmutablePass *
llvm::createAArch64TargetTransformInfoPass(const AArch64TargetMachine *TM) {
  return new AArch64TTI(TM);
}

/// \brief Calculate the cost of materializing a 64-bit value. This helper
/// method might only calculate a fraction of a larger immediate. Therefore it
/// is valid to return a cost of ZERO.
unsigned AArch64TTI::getIntImmCost(int64_t Val) const {
  // Check if the immediate can be encoded within an instruction.
  if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
    return 0;

  // For negative values, count the significant bits of the bitwise complement
  // instead, so that the leading-zero count below measures the payload in
  // either sign direction.
  if (Val < 0)
    Val = ~Val;

  // Calculate how many moves we will need to materialize this constant: one
  // per 16-bit chunk of significant bits, rounded up. NOTE(review): presumably
  // one MOVZ plus MOVK per additional chunk — confirm against the MC layer.
  unsigned LZ = countLeadingZeros((uint64_t)Val);
  return (64 - LZ + 15) / 16;
}

/// \brief Calculate the cost of materializing the given constant.
///
/// The constant is sign-extended up to a multiple of 64 bits, split into
/// 64-bit chunks, and each chunk is costed independently via the int64_t
/// overload above.
unsigned AArch64TTI::getIntImmCost(const APInt &Imm, Type *Ty) const {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // A zero-width integer has no meaningful cost; return "infinite" cost.
  if (BitSize == 0)
    return ~0U;

  // Sign-extend all constants to a multiple of 64-bit.
  APInt ImmVal = Imm;
  if (BitSize & 0x3f)
    ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);

  // Split the constant into 64-bit chunks and calculate the cost for each
  // chunk.
  unsigned Cost = 0;
  for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
    APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
    int64_t Val = Tmp.getSExtValue();
    Cost += getIntImmCost(Val);
  }
  // We need at least one instruction to materialize the constant.
  return std::max(1U, Cost);
}

/// \brief Cost of using \p Imm as operand \p Idx of instruction \p Opcode,
/// as queried by constant hoisting. Returns TCC_Free for operand positions
/// where the immediate can typically be folded into the instruction, so
/// hoisting leaves those constants alone.
unsigned AArch64TTI::getIntImmCost(unsigned Opcode, unsigned Idx,
                                   const APInt &Imm, Type *Ty) const {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TCC_Free;

  // Index of the operand that may be encodable as an immediate for this
  // opcode; ~0U means no such operand.
  unsigned ImmIdx = ~0U;
  switch (Opcode) {
  default:
    return TCC_Free;
  case Instruction::GetElementPtr:
    // Always hoist the base address of a GetElementPtr.
    if (Idx == 0)
      return 2 * TCC_Basic;
    return TCC_Free;
  case Instruction::Store:
    ImmIdx = 0;
    break;
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::ICmp:
    ImmIdx = 1;
    break;
  // Always return TCC_Free for the shift value of a shift instruction.
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    if (Idx == 1)
      return TCC_Free;
    break;
  // For these opcodes fall through to the generic materialization cost below.
  case Instruction::Trunc:
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::IntToPtr:
  case Instruction::PtrToInt:
  case Instruction::BitCast:
  case Instruction::PHI:
  case Instruction::Call:
  case Instruction::Select:
  case Instruction::Ret:
  case Instruction::Load:
    break;
  }

  if (Idx == ImmIdx) {
    // Cheap-to-materialize immediates in an encodable operand slot are free;
    // the threshold is one TCC_Basic per 64-bit chunk of the constant.
    unsigned NumConstants = (BitSize + 63) / 64;
    unsigned Cost = AArch64TTI::getIntImmCost(Imm, Ty);
    return (Cost <= NumConstants * TCC_Basic)
        ? static_cast<unsigned>(TCC_Free) : Cost;
  }
  return AArch64TTI::getIntImmCost(Imm, Ty);
}

/// \brief Cost of using \p Imm as operand \p Idx of intrinsic \p IID, as
/// queried by constant hoisting. Overflow intrinsics mirror the binary
/// operator handling above; stackmap/patchpoint immediates that fit in 64
/// bits are free.
unsigned AArch64TTI::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
                                   const APInt &Imm, Type *Ty) const {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TCC_Free;

  switch (IID) {
  default:
    return TCC_Free;
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
  case Intrinsic::smul_with_overflow:
  case Intrinsic::umul_with_overflow:
    // Same free-if-cheap rule as for the corresponding binary operators.
    if (Idx == 1) {
      unsigned NumConstants = (BitSize + 63) / 64;
      unsigned Cost = AArch64TTI::getIntImmCost(Imm, Ty);
      return (Cost <= NumConstants * TCC_Basic)
          ? static_cast<unsigned>(TCC_Free) : Cost;
    }
    break;
  case Intrinsic::experimental_stackmap:
    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TCC_Free;
    break;
  case Intrinsic::experimental_patchpoint_void:
  case Intrinsic::experimental_patchpoint_i64:
    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TCC_Free;
    break;
  }
  return AArch64TTI::getIntImmCost(Imm, Ty);
}

/// \brief Report hardware popcount support: fast for 32- and 64-bit types,
/// software otherwise.
AArch64TTI::PopcntSupportKind
AArch64TTI::getPopcntSupport(unsigned TyWidth) const {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  if (TyWidth == 32 || TyWidth == 64)
    return PSK_FastHardware;
  // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
  return PSK_Software;
}

/// \brief Cost of a cast from \p Src to \p Dst, using a hand-maintained table
/// of vector int<->fp conversions the target lowers cheaply; falls back to the
/// generic implementation for anything not in the table.
unsigned AArch64TTI::getCastInstrCost(unsigned Opcode, Type *Dst,
                                      Type *Src) const {
  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  EVT SrcTy = TLI->getValueType(Src);
  EVT DstTy = TLI->getValueType(Dst);

  // The table below is keyed on simple MVTs; anything else takes the generic
  // path.
  if (!SrcTy.isSimple() || !DstTy.isSimple())
    return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);

  static const TypeConversionCostTblEntry<MVT> ConversionTbl[] = {
    // LowerVectorINT_TO_FP:
    { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 1 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 1 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 1 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 1 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
    // LowerVectorFP_TO_INT
    { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
    { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
    { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
    { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
    { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 1 },
    { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 1 },
    { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 4 },
    { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 4 },
    { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 4 },
    { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 4 },
    { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 4 },
    { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 4 },
  };

  int Idx = ConvertCostTableLookup<MVT>(
      ConversionTbl, array_lengthof(ConversionTbl), ISD, DstTy.getSimpleVT(),
      SrcTy.getSimpleVT());
  if (Idx != -1)
    return ConversionTbl[Idx].Cost;

  return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);
}

/// \brief Cost of a vector insert/extract at \p Index. Index == -1U denotes an
/// unknown lane. Lane 0 of a legal vector is free; any other lane costs 2.
unsigned AArch64TTI::getVectorInstrCost(unsigned Opcode, Type *Val,
                                        unsigned Index) const {
  assert(Val->isVectorTy() && "This must be a vector type");

  if (Index != -1U) {
    // Legalize the type.
    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Val);

    // This type is legalized to a scalar type.
    if (!LT.second.isVector())
      return 0;

    // The type may be split. Normalize the index to the new type.
    unsigned Width = LT.second.getVectorNumElements();
    Index = Index % Width;

    // The element at index zero is already inside the vector.
    if (Index == 0)
      return 0;
  }

  // All other insert/extracts cost this much.
  return 2;
}

/// \brief Cost of a binary arithmetic operation. The listed integer ops are
/// legal after legalization and cost one instruction per legalized part;
/// everything else goes through the generic cost model.
unsigned AArch64TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
                                            OperandValueKind Opd1Info,
                                            OperandValueKind Opd2Info) const {
  // Legalize the type.
  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);

  switch (ISD) {
  default:
    return TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty, Opd1Info,
                                                       Opd2Info);
  case ISD::ADD:
  case ISD::MUL:
  case ISD::XOR:
  case ISD::OR:
  case ISD::AND:
    // These nodes are marked as 'custom' for combining purposes only.
    // We know that they are legal. See LowerAdd in ISelLowering.
    return 1 * LT.first;
  }
}

/// \brief Cost of computing an address for a memory access of type \p Ty.
unsigned AArch64TTI::getAddressComputationCost(Type *Ty, bool IsComplex) const {
  // Address computations in vectorized code with non-consecutive addresses will
  // likely result in more instructions compared to scalar code where the
  // computation can more often be merged into the index mode. The resulting
  // extra micro-ops can significantly decrease throughput.
  unsigned NumVectorInstToHideOverhead = 10;

  if (Ty->isVectorTy() && IsComplex)
    return NumVectorInstToHideOverhead;

  // In many cases the address computation is not merged into the instruction
  // addressing mode.
  return 1;
}

/// \brief Cost of a compare or select. Vector selects wider than the register
/// width are penalized via a per-element amortization table; everything else
/// is handled generically.
unsigned AArch64TTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                                        Type *CondTy) const {

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  // We don't lower vector selects well that are wider than the register width.
  if (ValTy->isVectorTy() && ISD == ISD::SELECT) {
    // We would need this many instructions to hide the scalarization happening.
    unsigned AmortizationCost = 20;
    static const TypeConversionCostTblEntry<MVT::SimpleValueType>
    VectorSelectTbl[] = {
      { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 * AmortizationCost },
      { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 * AmortizationCost },
      { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 * AmortizationCost },
      { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
      { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
      { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
    };

    EVT SelCondTy = TLI->getValueType(CondTy);
    EVT SelValTy = TLI->getValueType(ValTy);
    if (SelCondTy.isSimple() && SelValTy.isSimple()) {
      int Idx =
          ConvertCostTableLookup(VectorSelectTbl, ISD, SelCondTy.getSimpleVT(),
                                 SelValTy.getSimpleVT());
      if (Idx != -1)
        return VectorSelectTbl[Idx].Cost;
    }
  }
  return TargetTransformInfo::getCmpSelInstrCost(Opcode, ValTy, CondTy);
}

/// \brief Cost of a load or store of type \p Src. Penalizes unaligned v2i64
/// stores and small i8 vectors; otherwise the cost is the legalization split
/// count.
unsigned AArch64TTI::getMemoryOpCost(unsigned Opcode, Type *Src,
                                     unsigned Alignment,
                                     unsigned AddressSpace) const {
  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src);

  if (Opcode == Instruction::Store && Src->isVectorTy() && Alignment != 16 &&
      Src->getVectorElementType()->isIntegerTy(64)) {
    // Unaligned stores are extremely inefficient. We don't split
    // unaligned v2i64 stores because the negative impact that has shown in
    // practice on inlined memcpy code.
    // We make v2i64 stores expensive so that we will only vectorize if there
    // are 6 other instructions getting vectorized.
    unsigned AmortizationCost = 6;

    return LT.first * 2 * AmortizationCost;
  }

  if (Src->isVectorTy() && Src->getVectorElementType()->isIntegerTy(8) &&
      Src->getVectorNumElements() < 8) {
    // We scalarize the loads/stores because there is no v.4b register and we
    // have to promote the elements to v.4h.
    unsigned NumVecElts = Src->getVectorNumElements();
    unsigned NumVectorizableInstsToAmortize = NumVecElts * 2;
    // We generate 2 instructions per vector element.
    return NumVectorizableInstsToAmortize * NumVecElts * 2;
  }

  return LT.first;
}