//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI pass --------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements a TargetTransformInfo analysis pass specific to the
/// AArch64 target machine. It uses the target's detailed information to
/// provide more precise answers to certain TTI queries, while letting the
/// target independent and default TTI implementations handle the rest.
///
//===----------------------------------------------------------------------===//

#include "AArch64.h"
#include "AArch64TargetMachine.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/CostTable.h"
#include "llvm/Target/TargetLowering.h"
#include <algorithm>
using namespace llvm;

#define DEBUG_TYPE "aarch64tti"

// Declare the pass initialization routine locally as target-specific passes
// don't have a target-wide initialization entry point, and so we rely on the
// pass constructor initialization.
namespace llvm {
void initializeAArch64TTIPass(PassRegistry &);
}

namespace {

class AArch64TTI final : public ImmutablePass, public TargetTransformInfo {
  const AArch64TargetMachine *TM;
  const AArch64Subtarget *ST;
  const AArch64TargetLowering *TLI;

  /// Estimate the overhead of scalarizing an instruction. Insert and Extract
  /// are set if the result needs to be inserted and/or extracted from vectors.
  unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;

public:
  AArch64TTI() : ImmutablePass(ID), TM(nullptr), ST(nullptr), TLI(nullptr) {
    llvm_unreachable("This pass cannot be directly constructed");
  }

  AArch64TTI(const AArch64TargetMachine *TM)
      : ImmutablePass(ID), TM(TM), ST(TM->getSubtargetImpl()),
        TLI(TM->getTargetLowering()) {
    initializeAArch64TTIPass(*PassRegistry::getPassRegistry());
  }

  void initializePass() override { pushTTIStack(this); }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    TargetTransformInfo::getAnalysisUsage(AU);
  }

  /// Pass identification.
  static char ID;

  /// Provide necessary pointer adjustments for the two base classes.
  void *getAdjustedAnalysisPointer(const void *ID) override {
    if (ID == &TargetTransformInfo::ID)
      return (TargetTransformInfo *)this;
    return this;
  }

  /// \name Scalar TTI Implementations
  /// @{
  unsigned getIntImmCost(int64_t Val) const;
  unsigned getIntImmCost(const APInt &Imm, Type *Ty) const override;
  unsigned getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
                         Type *Ty) const override;
  unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
                         Type *Ty) const override;
  PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override;

  /// @}

  /// \name Vector TTI Implementations
  /// @{

  unsigned getNumberOfRegisters(bool Vector) const override {
    if (Vector) {
      if (ST->hasNEON())
        return 32;
      return 0;
    }
    return 31;
  }

  unsigned getRegisterBitWidth(bool Vector) const override {
    if (Vector) {
      if (ST->hasNEON())
        return 128;
      return 0;
    }
    return 64;
  }

  unsigned getMaximumUnrollFactor() const override { return 2; }

  unsigned getCastInstrCost(unsigned Opcode, Type *Dst,
                            Type *Src) const override;

  unsigned getVectorInstrCost(unsigned Opcode, Type *Val,
                              unsigned Index) const override;

  unsigned getArithmeticInstrCost(
      unsigned Opcode, Type *Ty, OperandValueKind Opd1Info = OK_AnyValue,
      OperandValueKind Opd2Info = OK_AnyValue) const override;

  unsigned getAddressComputationCost(Type *Ty, bool IsComplex) const override;

  unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                              Type *CondTy) const override;

  unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
                           unsigned AddressSpace) const override;
  /// @}
};

} // end anonymous namespace

INITIALIZE_AG_PASS(AArch64TTI, TargetTransformInfo, "aarch64tti",
                   "AArch64 Target Transform Info", true, true, false)
char AArch64TTI::ID = 0;

ImmutablePass *
llvm::createAArch64TargetTransformInfoPass(const AArch64TargetMachine *TM) {
  return new AArch64TTI(TM);
}

/// \brief Calculate the cost of materializing a 64-bit value. This helper
/// method might only calculate a fraction of a larger immediate. Therefore it
/// is valid to return a cost of ZERO.
unsigned AArch64TTI::getIntImmCost(int64_t Val) const {
  // Check if the immediate can be encoded within an instruction.
  if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
    return 0;

  if (Val < 0)
    Val = ~Val;

  // Calculate how many moves we will need to materialize this constant.
  unsigned LZ = countLeadingZeros((uint64_t)Val);
  return (64 - LZ + 15) / 16;
}

/// \brief Calculate the cost of materializing the given constant.
unsigned AArch64TTI::getIntImmCost(const APInt &Imm, Type *Ty) const {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  // Sign-extend all constants to a multiple of 64 bits.
  APInt ImmVal = Imm;
  if (BitSize & 0x3f)
    ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);

  // Split the constant into 64-bit chunks and calculate the cost for each
  // chunk.
  unsigned Cost = 0;
  for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
    APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
    int64_t Val = Tmp.getSExtValue();
    Cost += getIntImmCost(Val);
  }
  // We need at least one instruction to materialize the constant.
  return std::max(1U, Cost);
}
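// A rough worked example of the two helpers above: the 64-bit constant
// 0x1234567812345678 is not a logical immediate and has three leading zero
// bits, so the cost comes out as (64 - 3 + 15) / 16 = 4, matching one MOVZ
// plus three MOVK instructions. A pattern such as 0x00FF00FF00FF00FF, on the
// other hand, is a valid logical immediate (a replicated run of set bits) and
// costs 0, since it can be folded directly into an AND/ORR/EOR.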
unsigned AArch64TTI::getIntImmCost(unsigned Opcode, unsigned Idx,
                                   const APInt &Imm, Type *Ty) const {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TCC_Free;

  unsigned ImmIdx = ~0U;
  switch (Opcode) {
  default:
    return TCC_Free;
  case Instruction::GetElementPtr:
    // Always hoist the base address of a GetElementPtr.
    if (Idx == 0)
      return 2 * TCC_Basic;
    return TCC_Free;
  case Instruction::Store:
    ImmIdx = 0;
    break;
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::ICmp:
    ImmIdx = 1;
    break;
  // Always return TCC_Free for the shift value of a shift instruction.
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    if (Idx == 1)
      return TCC_Free;
    break;
  case Instruction::Trunc:
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::IntToPtr:
  case Instruction::PtrToInt:
  case Instruction::BitCast:
  case Instruction::PHI:
  case Instruction::Call:
  case Instruction::Select:
  case Instruction::Ret:
  case Instruction::Load:
    break;
  }

  if (Idx == ImmIdx) {
    unsigned NumConstants = (BitSize + 63) / 64;
    unsigned Cost = AArch64TTI::getIntImmCost(Imm, Ty);
    return (Cost <= NumConstants * TCC_Basic)
               ? static_cast<unsigned>(TCC_Free)
               : Cost;
  }
  return AArch64TTI::getIntImmCost(Imm, Ty);
}

unsigned AArch64TTI::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
                                   const APInt &Imm, Type *Ty) const {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TCC_Free;

  switch (IID) {
  default:
    return TCC_Free;
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
  case Intrinsic::smul_with_overflow:
  case Intrinsic::umul_with_overflow:
    if (Idx == 1) {
      unsigned NumConstants = (BitSize + 63) / 64;
      unsigned Cost = AArch64TTI::getIntImmCost(Imm, Ty);
      return (Cost <= NumConstants * TCC_Basic)
                 ? static_cast<unsigned>(TCC_Free)
                 : Cost;
    }
    break;
  case Intrinsic::experimental_stackmap:
    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TCC_Free;
    break;
  case Intrinsic::experimental_patchpoint_void:
  case Intrinsic::experimental_patchpoint_i64:
    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TCC_Free;
    break;
  }
  return AArch64TTI::getIntImmCost(Imm, Ty);
}

AArch64TTI::PopcntSupportKind
AArch64TTI::getPopcntSupport(unsigned TyWidth) const {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  if (TyWidth == 32 || TyWidth == 64)
    return PSK_FastHardware;
  // TODO: AArch64TargetLowering::LowerCTPOP() supports a 128-bit popcount.
  return PSK_Software;
}
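// For reference, scalar CTPOP is lowered through the NEON unit as roughly an
// FMOV into a vector register, a CNT over the byte lanes, an ADDV to sum the
// per-byte counts, and a move back to the GPR. That short, branch-free
// sequence is why 32- and 64-bit popcounts are reported as fast hardware
// above.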
unsigned AArch64TTI::getCastInstrCost(unsigned Opcode, Type *Dst,
                                      Type *Src) const {
  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  EVT SrcTy = TLI->getValueType(Src);
  EVT DstTy = TLI->getValueType(Dst);

  if (!SrcTy.isSimple() || !DstTy.isSimple())
    return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);

  static const TypeConversionCostTblEntry<MVT> ConversionTbl[] = {
    // LowerVectorINT_TO_FP:
    { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },

    // Complex: to v2f32
    { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8,  3 },
    { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
    { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },
    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8,  3 },
    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },

    // Complex: to v4f32
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8,  4 },
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8,  3 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },

    // Complex: to v2f64
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8,  4 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8,  4 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },

    // LowerVectorFP_TO_INT
    { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1 },
    { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
    { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
    { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 },
    { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
    { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },

    // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
    { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2 },
    { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1 },
    { ISD::FP_TO_SINT, MVT::v2i8,  MVT::v2f32, 1 },
    { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2 },
    { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1 },
    { ISD::FP_TO_UINT, MVT::v2i8,  MVT::v2f32, 1 },

    // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2.
    { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
    { ISD::FP_TO_SINT, MVT::v4i8,  MVT::v4f32, 2 },
    { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },
    { ISD::FP_TO_UINT, MVT::v4i8,  MVT::v4f32, 2 },

    // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
    { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 },
    { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2 },
    { ISD::FP_TO_SINT, MVT::v2i8,  MVT::v2f64, 2 },
    { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 },
    { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2 },
    { ISD::FP_TO_UINT, MVT::v2i8,  MVT::v2f64, 2 },
  };

  int Idx = ConvertCostTableLookup<MVT>(
      ConversionTbl, array_lengthof(ConversionTbl), ISD, DstTy.getSimpleVT(),
      SrcTy.getSimpleVT());
  if (Idx != -1)
    return ConversionTbl[Idx].Cost;

  return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);
}
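// A rough example of how the table above is used: a "sitofp <4 x i16> to
// <4 x float>" maps to {SINT_TO_FP, v4f32, v4i16} and is costed at 2,
// corresponding to an SSHLL widen to v4i32 followed by a single SCVTF.
// Conversions without a table entry fall through to the generic
// TargetTransformInfo implementation.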
unsigned AArch64TTI::getVectorInstrCost(unsigned Opcode, Type *Val,
                                        unsigned Index) const {
  assert(Val->isVectorTy() && "This must be a vector type");

  if (Index != -1U) {
    // Legalize the type.
    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Val);

    // This type is legalized to a scalar type.
    if (!LT.second.isVector())
      return 0;

    // The type may be split. Normalize the index to the new type.
    unsigned Width = LT.second.getVectorNumElements();
    Index = Index % Width;

    // The element at index zero is already inside the vector.
    if (Index == 0)
      return 0;
  }

  // All other inserts/extracts cost this much.
  return 2;
}

unsigned AArch64TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
                                            OperandValueKind Opd1Info,
                                            OperandValueKind Opd2Info) const {
  // Legalize the type.
  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);

  switch (ISD) {
  default:
    return TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty, Opd1Info,
                                                       Opd2Info);
  case ISD::ADD:
  case ISD::MUL:
  case ISD::XOR:
  case ISD::OR:
  case ISD::AND:
    // These nodes are marked as 'custom' for combining purposes only.
    // We know that they are legal. See LowerAdd in ISelLowering.
    return 1 * LT.first;
  }
}

unsigned AArch64TTI::getAddressComputationCost(Type *Ty, bool IsComplex) const {
  // Address computations in vectorized code with non-consecutive addresses
  // will likely result in more instructions compared to scalar code where the
  // computation can more often be merged into the index mode. The resulting
  // extra micro-ops can significantly decrease throughput.
  unsigned NumVectorInstToHideOverhead = 10;

  if (Ty->isVectorTy() && IsComplex)
    return NumVectorInstToHideOverhead;

  // In many cases the address computation is not merged into the instruction
  // addressing mode.
  return 1;
}
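// As an illustration, a vectorized gather-like access where every lane needs
// its own add/shift address arithmetic is charged the full overhead of 10,
// while all other cases are charged 1, since even a simple address
// computation may still occupy a separate instruction rather than folding
// into a [base, index, lsl #shift] addressing mode.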
unsigned AArch64TTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                                        Type *CondTy) const {
  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  // Vector selects that are wider than the register width are not lowered
  // well.
  if (ValTy->isVectorTy() && ISD == ISD::SELECT) {
    // We would need this many instructions to hide the scalarization
    // happening.
    unsigned AmortizationCost = 20;
    static const TypeConversionCostTblEntry<MVT::SimpleValueType>
        VectorSelectTbl[] = {
          { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 * AmortizationCost },
          { ISD::SELECT, MVT::v8i1,  MVT::v8i32,  8 * AmortizationCost },
          { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 * AmortizationCost },
          { ISD::SELECT, MVT::v4i1,  MVT::v4i64,  4 * AmortizationCost },
          { ISD::SELECT, MVT::v8i1,  MVT::v8i64,  8 * AmortizationCost },
          { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
        };

    EVT SelCondTy = TLI->getValueType(CondTy);
    EVT SelValTy = TLI->getValueType(ValTy);
    if (SelCondTy.isSimple() && SelValTy.isSimple()) {
      int Idx =
          ConvertCostTableLookup(VectorSelectTbl, ISD, SelCondTy.getSimpleVT(),
                                 SelValTy.getSimpleVT());
      if (Idx != -1)
        return VectorSelectTbl[Idx].Cost;
    }
  }
  return TargetTransformInfo::getCmpSelInstrCost(Opcode, ValTy, CondTy);
}

unsigned AArch64TTI::getMemoryOpCost(unsigned Opcode, Type *Src,
                                     unsigned Alignment,
                                     unsigned AddressSpace) const {
  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src);

  if (Opcode == Instruction::Store && Src->isVectorTy() && Alignment != 16 &&
      Src->getVectorElementType()->isIntegerTy(64)) {
    // Unaligned stores are extremely inefficient. We don't split unaligned
    // v2i64 stores because of the negative impact that has been observed in
    // practice on inlined memcpy code.
    // We make v2i64 stores expensive so that we will only vectorize if there
    // are 6 other instructions getting vectorized.
    unsigned AmortizationCost = 6;

    return LT.first * 2 * AmortizationCost;
  }

  if (Src->isVectorTy() && Src->getVectorElementType()->isIntegerTy(8) &&
      Src->getVectorNumElements() < 8) {
    // We scalarize the loads/stores because there is no v.4b register and we
    // have to promote the elements to v.4h.
    unsigned NumVecElts = Src->getVectorNumElements();
    unsigned NumVectorizableInstsToAmortize = NumVecElts * 2;
    // We generate 2 instructions per vector element.
    return NumVectorizableInstsToAmortize * NumVecElts * 2;
  }

  return LT.first;
}
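// Rough worked examples of the memory cost model above: an unaligned store of
// a <2 x i64> legalizes to a single v2i64 store (LT.first == 1) and is charged
// 1 * 2 * 6 = 12, while a load or store of a <4 x i8> is charged
// (4 * 2) * 4 * 2 = 64, reflecting the scalarization and promotion of each i8
// element.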