X86TargetTransformInfo.cpp revision 36b56886974eae4f9c5ebc96befd3e7bfe5de338
//===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ---------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements a TargetTransformInfo analysis pass specific to the
/// X86 target machine. It uses the target's detailed information to provide
/// more precise answers to certain TTI queries, while letting the target
/// independent and default TTI implementations handle the rest.
///
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "x86tti"
#include "X86.h"
#include "X86TargetMachine.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/CostTable.h"
#include "llvm/Target/TargetLowering.h"
using namespace llvm;

// Declare the pass initialization routine locally as target-specific passes
// don't have a target-wide initialization entry point, and so we rely on the
// pass constructor initialization.
namespace llvm {
void initializeX86TTIPass(PassRegistry &);
}

static cl::opt<bool>
UsePartialUnrolling("x86-use-partial-unrolling", cl::init(true),
  cl::desc("Use partial unrolling for some X86 targets"), cl::Hidden);
static cl::opt<unsigned>
PartialUnrollingThreshold("x86-partial-unrolling-threshold", cl::init(0),
  cl::desc("Threshold for X86 partial unrolling"), cl::Hidden);
static cl::opt<unsigned>
PartialUnrollingMaxBranches("x86-partial-max-branches", cl::init(2),
  cl::desc("Threshold for taken branches in X86 partial unrolling"),
  cl::Hidden);

namespace {

class X86TTI final : public ImmutablePass, public TargetTransformInfo {
  const X86Subtarget *ST;
  const X86TargetLowering *TLI;

  /// Estimate the overhead of scalarizing an instruction. Insert and Extract
  /// are set if the result needs to be inserted and/or extracted from vectors.
  unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;

public:
  X86TTI() : ImmutablePass(ID), ST(0), TLI(0) {
    llvm_unreachable("This pass cannot be directly constructed");
  }

  X86TTI(const X86TargetMachine *TM)
      : ImmutablePass(ID), ST(TM->getSubtargetImpl()),
        TLI(TM->getTargetLowering()) {
    initializeX86TTIPass(*PassRegistry::getPassRegistry());
  }

  void initializePass() override {
    pushTTIStack(this);
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    TargetTransformInfo::getAnalysisUsage(AU);
  }

  /// Pass identification.
  static char ID;

  /// Provide necessary pointer adjustments for the two base classes.
  void *getAdjustedAnalysisPointer(const void *ID) override {
    if (ID == &TargetTransformInfo::ID)
      return (TargetTransformInfo*)this;
    return this;
  }

  /// \name Scalar TTI Implementations
  /// @{
  PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override;
  void getUnrollingPreferences(Loop *L,
                               UnrollingPreferences &UP) const override;

  /// @}

  /// \name Vector TTI Implementations
  /// @{

  unsigned getNumberOfRegisters(bool Vector) const override;
  unsigned getRegisterBitWidth(bool Vector) const override;
  unsigned getMaximumUnrollFactor() const override;
  unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueKind,
                                  OperandValueKind) const override;
  unsigned getShuffleCost(ShuffleKind Kind, Type *Tp,
                          int Index, Type *SubTp) const override;
  unsigned getCastInstrCost(unsigned Opcode, Type *Dst,
                            Type *Src) const override;
  unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                              Type *CondTy) const override;
  unsigned getVectorInstrCost(unsigned Opcode, Type *Val,
                              unsigned Index) const override;
  unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
                           unsigned AddressSpace) const override;

  unsigned getAddressComputationCost(Type *PtrTy,
                                     bool IsComplex) const override;

  unsigned getReductionCost(unsigned Opcode, Type *Ty,
                            bool IsPairwiseForm) const override;

  unsigned getIntImmCost(const APInt &Imm, Type *Ty) const override;

  unsigned getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
                         Type *Ty) const override;
  unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
                         Type *Ty) const override;

  /// @}
};

} // end anonymous namespace

INITIALIZE_AG_PASS(X86TTI, TargetTransformInfo, "x86tti",
                   "X86 Target Transform Info", true, true, false)
char X86TTI::ID = 0;

ImmutablePass *
llvm::createX86TargetTransformInfoPass(const X86TargetMachine *TM) {
  return new X86TTI(TM);
}


//===----------------------------------------------------------------------===//
//
// X86 cost model.
//
//===----------------------------------------------------------------------===//

X86TTI::PopcntSupportKind X86TTI::getPopcntSupport(unsigned TyWidth) const {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  // TODO: Currently the __builtin_popcount() implementation using SSE3
  //   instructions is inefficient. Once the problem is fixed, we should
  //   call ST->hasSSE3() instead of ST->hasPOPCNT().
  return ST->hasPOPCNT() ? PSK_FastHardware : PSK_Software;
}

void X86TTI::getUnrollingPreferences(Loop *L, UnrollingPreferences &UP) const {
  if (!UsePartialUnrolling)
    return;
  // According to the Intel 64 and IA-32 Architectures Optimization Reference
  // Manual, Intel Core models and later have a loop stream detector
  // (and associated uop queue) that can benefit from partial unrolling.
  // The relevant requirements are:
  //  - The loop must have no more than 4 (8 for Nehalem and later) branches
  //    taken, and none of them may be calls.
  //  - The loop can have no more than 18 (28 for Nehalem and later) uops.

  // According to the Software Optimization Guide for AMD Family 15h Processors,
  // models 30h-4fh (Steamroller and later) have a loop predictor and loop
  // buffer which can benefit from partial unrolling.
  // The relevant requirements are:
  //  - The loop must have fewer than 16 branches
  //  - The loop must have fewer than 40 uops in all executed loop branches

  unsigned MaxBranches, MaxOps;
  if (PartialUnrollingThreshold.getNumOccurrences() > 0) {
    MaxBranches = PartialUnrollingMaxBranches;
    MaxOps = PartialUnrollingThreshold;
  } else if (ST->isAtom()) {
    // On the Atom, the throughput for taken branches is 2 cycles. For small
    // simple loops, expand by a small factor to hide the backedge cost.
    MaxBranches = 2;
    MaxOps = 10;
  } else if (ST->hasFSGSBase() && ST->hasXOP() /* Steamroller and later */) {
    MaxBranches = 16;
    MaxOps = 40;
  } else if (ST->hasFMA4() /* Any other recent AMD */) {
    return;
  } else if (ST->hasAVX() || ST->hasSSE42() /* Nehalem and later */) {
    MaxBranches = 8;
    MaxOps = 28;
  } else if (ST->hasSSSE3() /* Intel Core */) {
    MaxBranches = 4;
    MaxOps = 18;
  } else {
    return;
  }

  // Scan the loop: don't unroll loops with calls, and count the potential
  // number of taken branches (this is somewhat conservative because we're
  // counting all block transitions as potential branches while in reality some
  // of these will become implicit via block placement).
  unsigned MaxDepth = 0;
  for (df_iterator<BasicBlock*> DI = df_begin(L->getHeader()),
       DE = df_end(L->getHeader()); DI != DE;) {
    if (!L->contains(*DI)) {
      DI.skipChildren();
      continue;
    }

    MaxDepth = std::max(MaxDepth, DI.getPathLength());
    if (MaxDepth > MaxBranches)
      return;

    for (BasicBlock::iterator I = DI->begin(), IE = DI->end(); I != IE; ++I)
      if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
        ImmutableCallSite CS(I);
        if (const Function *F = CS.getCalledFunction()) {
          if (!isLoweredToCall(F))
            continue;
        }

        return;
      }

    ++DI;
  }

  // Enable runtime and partial unrolling up to the specified size.
  UP.Partial = UP.Runtime = true;
  UP.PartialThreshold = UP.PartialOptSizeThreshold = MaxOps;

  // Set the maximum count based on the loop depth. The maximum number of
  // branches taken in a loop (including the backedge) is equal to the maximum
  // loop depth (the DFS path length from the loop header to any block in the
  // loop). When the loop is unrolled, this depth (except for the backedge
  // itself) is multiplied by the unrolling factor. This new unrolled depth
  // must be less than the target-specific maximum branch count (which limits
  // the number of taken branches in the uop buffer). For example, with
  // MaxBranches == 8 and MaxDepth == 3, the unroll count is capped at
  // (8-1)/(3-1) == 3.
  if (MaxDepth > 1)
    UP.MaxCount = (MaxBranches-1)/(MaxDepth-1);
}

unsigned X86TTI::getNumberOfRegisters(bool Vector) const {
  if (Vector && !ST->hasSSE1())
    return 0;

  if (ST->is64Bit())
    return 16;
  return 8;
}

unsigned X86TTI::getRegisterBitWidth(bool Vector) const {
  if (Vector) {
    if (ST->hasAVX()) return 256;
    if (ST->hasSSE1()) return 128;
    return 0;
  }

  if (ST->is64Bit())
    return 64;
  return 32;

}

unsigned X86TTI::getMaximumUnrollFactor() const {
  if (ST->isAtom())
    return 1;

  // Sandybridge and Haswell have multiple execution ports and pipelined
  // vector units.
  if (ST->hasAVX())
    return 4;

  return 2;
}

unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
                                        OperandValueKind Op1Info,
                                        OperandValueKind Op2Info) const {
  // Legalize the type.
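  // Note: getTypeLegalizationCost returns the number of legal-typed pieces
  // the IR type is split into (LT.first) and the legal MVT of each piece
  // (LT.second); the per-type costs in the tables below are scaled by
  // LT.first. For example, a <16 x i32> operation on an SSE2-only target is
  // split into four v4i32 pieces, so a table cost of 1 becomes 4.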
  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  static const CostTblEntry<MVT::SimpleValueType> AVX2CostTable[] = {
    // Shifts on v4i64/v8i32 on AVX2 are legal even though we declare them as
    // custom so that we can detect the cases where the shift amount is a
    // scalar.
    { ISD::SHL, MVT::v4i32, 1 },
    { ISD::SRL, MVT::v4i32, 1 },
    { ISD::SRA, MVT::v4i32, 1 },
    { ISD::SHL, MVT::v8i32, 1 },
    { ISD::SRL, MVT::v8i32, 1 },
    { ISD::SRA, MVT::v8i32, 1 },
    { ISD::SHL, MVT::v2i64, 1 },
    { ISD::SRL, MVT::v2i64, 1 },
    { ISD::SHL, MVT::v4i64, 1 },
    { ISD::SRL, MVT::v4i64, 1 },

    { ISD::SHL, MVT::v32i8, 42 }, // cmpeqb sequence.
    { ISD::SHL, MVT::v16i16, 16*10 }, // Scalarized.

    { ISD::SRL, MVT::v32i8, 32*10 }, // Scalarized.
    { ISD::SRL, MVT::v16i16, 8*10 }, // Scalarized.

    { ISD::SRA, MVT::v32i8, 32*10 }, // Scalarized.
    { ISD::SRA, MVT::v16i16, 16*10 }, // Scalarized.
    { ISD::SRA, MVT::v4i64, 4*10 }, // Scalarized.

    // Vectorizing division is a bad idea. See the SSE2 table for more comments.
    { ISD::SDIV, MVT::v32i8, 32*20 },
    { ISD::SDIV, MVT::v16i16, 16*20 },
    { ISD::SDIV, MVT::v8i32, 8*20 },
    { ISD::SDIV, MVT::v4i64, 4*20 },
    { ISD::UDIV, MVT::v32i8, 32*20 },
    { ISD::UDIV, MVT::v16i16, 16*20 },
    { ISD::UDIV, MVT::v8i32, 8*20 },
    { ISD::UDIV, MVT::v4i64, 4*20 },
  };

  // Look for AVX2 lowering tricks.
  if (ST->hasAVX2()) {
    if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
        (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
         Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
      // On AVX2, a packed v16i16 shift left by a constant build_vector
      // is lowered into a vector multiply (vpmullw).
      return LT.first;

    int Idx = CostTableLookup(AVX2CostTable, ISD, LT.second);
    if (Idx != -1)
      return LT.first * AVX2CostTable[Idx].Cost;
  }

  static const CostTblEntry<MVT::SimpleValueType>
  SSE2UniformConstCostTable[] = {
    // We don't correctly identify costs of casts because they are marked as
    // custom.
    // Constant splats are cheaper for the following instructions.
    { ISD::SHL, MVT::v16i8, 1 }, // psllw.
    { ISD::SHL, MVT::v8i16, 1 }, // psllw.
    { ISD::SHL, MVT::v4i32, 1 }, // pslld
    { ISD::SHL, MVT::v2i64, 1 }, // psllq.

    { ISD::SRL, MVT::v16i8, 1 }, // psrlw.
    { ISD::SRL, MVT::v8i16, 1 }, // psrlw.
    { ISD::SRL, MVT::v4i32, 1 }, // psrld.
    { ISD::SRL, MVT::v2i64, 1 }, // psrlq.

    { ISD::SRA, MVT::v16i8, 4 }, // psrlw, pand, pxor, psubb.
    { ISD::SRA, MVT::v8i16, 1 }, // psraw.
    { ISD::SRA, MVT::v4i32, 1 }, // psrad.
  };

  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      ST->hasSSE2()) {
    int Idx = CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second);
    if (Idx != -1)
      return LT.first * SSE2UniformConstCostTable[Idx].Cost;
  }

  if (ISD == ISD::SHL &&
      Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) {
    EVT VT = LT.second;
    if ((VT == MVT::v8i16 && ST->hasSSE2()) ||
        (VT == MVT::v4i32 && ST->hasSSE41()))
      // Vector shift left by non uniform constant can be lowered
      // into vector multiply (pmullw/pmulld).
      return LT.first;
    if (VT == MVT::v4i32 && ST->hasSSE2())
      // A vector shift left by non uniform constant is converted
      // into a vector multiply; the new multiply is eventually
      // lowered into a sequence of shuffles and 2 x pmuludq.
      ISD = ISD::MUL;
  }

  static const CostTblEntry<MVT::SimpleValueType> SSE2CostTable[] = {
    // We don't correctly identify costs of casts because they are marked as
    // custom.
    // For some cases, where the shift amount is a scalar we would be able
    // to generate better code. Unfortunately, when this is the case the value
    // (the splat) will get hoisted out of the loop, thereby making it invisible
    // to ISel. The cost model must return worst case assumptions because it is
    // used for vectorization and we don't want to make vectorized code worse
    // than scalar code.
    { ISD::SHL, MVT::v16i8, 30 }, // cmpeqb sequence.
    { ISD::SHL, MVT::v8i16, 8*10 }, // Scalarized.
    { ISD::SHL, MVT::v4i32, 2*5 }, // We optimized this using mul.
    { ISD::SHL, MVT::v2i64, 2*10 }, // Scalarized.
    { ISD::SHL, MVT::v4i64, 4*10 }, // Scalarized.

    { ISD::SRL, MVT::v16i8, 16*10 }, // Scalarized.
    { ISD::SRL, MVT::v8i16, 8*10 }, // Scalarized.
    { ISD::SRL, MVT::v4i32, 4*10 }, // Scalarized.
    { ISD::SRL, MVT::v2i64, 2*10 }, // Scalarized.

    { ISD::SRA, MVT::v16i8, 16*10 }, // Scalarized.
    { ISD::SRA, MVT::v8i16, 8*10 }, // Scalarized.
    { ISD::SRA, MVT::v4i32, 4*10 }, // Scalarized.
    { ISD::SRA, MVT::v2i64, 2*10 }, // Scalarized.

    // It is not a good idea to vectorize division. We have to scalarize it and
    // in the process we will often end up having to spill regular
    // registers. The overhead of division is going to dominate most kernels
    // anyway, so try hard to prevent vectorization of division - it is
    // generally a bad idea. Assume somewhat arbitrarily that we have to be able
    // to hide "20 cycles" for each lane.
    { ISD::SDIV, MVT::v16i8, 16*20 },
    { ISD::SDIV, MVT::v8i16, 8*20 },
    { ISD::SDIV, MVT::v4i32, 4*20 },
    { ISD::SDIV, MVT::v2i64, 2*20 },
    { ISD::UDIV, MVT::v16i8, 16*20 },
    { ISD::UDIV, MVT::v8i16, 8*20 },
    { ISD::UDIV, MVT::v4i32, 4*20 },
    { ISD::UDIV, MVT::v2i64, 2*20 },
  };

  if (ST->hasSSE2()) {
    int Idx = CostTableLookup(SSE2CostTable, ISD, LT.second);
    if (Idx != -1)
      return LT.first * SSE2CostTable[Idx].Cost;
  }

  static const CostTblEntry<MVT::SimpleValueType> AVX1CostTable[] = {
    // We don't have to scalarize unsupported ops. We can issue two half-sized
    // operations and we only need to extract the upper YMM half.
    // Two ops + 1 extract + 1 insert = 4.
    { ISD::MUL, MVT::v16i16, 4 },
    { ISD::MUL, MVT::v8i32, 4 },
    { ISD::SUB, MVT::v8i32, 4 },
    { ISD::ADD, MVT::v8i32, 4 },
    { ISD::SUB, MVT::v4i64, 4 },
    { ISD::ADD, MVT::v4i64, 4 },
    // A v4i64 multiply is custom lowered as two split v2i64 vectors that then
    // are lowered as a series of long multiplies(3), shifts(4) and adds(2).
    // Because we believe v4i64 to be a legal type, we must also include the
    // split factor of two in the cost table. Therefore, the cost here is 18
    // instead of 9.
    { ISD::MUL, MVT::v4i64, 18 },
  };

  // Look for AVX1 lowering tricks.
  if (ST->hasAVX() && !ST->hasAVX2()) {
    EVT VT = LT.second;

    // v16i16 and v8i32 shifts by non-uniform constants are lowered into a
    // sequence of extract + two vector multiply + insert.
    if (ISD == ISD::SHL && (VT == MVT::v8i32 || VT == MVT::v16i16) &&
        Op2Info == TargetTransformInfo::OK_NonUniformConstantValue)
      ISD = ISD::MUL;

    int Idx = CostTableLookup(AVX1CostTable, ISD, VT);
    if (Idx != -1)
      return LT.first * AVX1CostTable[Idx].Cost;
  }

  // Custom lowering of vectors.
  static const CostTblEntry<MVT::SimpleValueType> CustomLowered[] = {
    // A v2i64/v4i64 multiply is custom lowered as a series of long
    // multiplies(3), shifts(4) and adds(2).
    { ISD::MUL, MVT::v2i64, 9 },
    { ISD::MUL, MVT::v4i64, 9 },
  };
  int Idx = CostTableLookup(CustomLowered, ISD, LT.second);
  if (Idx != -1)
    return LT.first * CustomLowered[Idx].Cost;

  // Special lowering of v4i32 mul on sse2, sse3: Lower v4i32 mul as 2x shuffle,
  // 2x pmuludq, 2x shuffle.
  if (ISD == ISD::MUL && LT.second == MVT::v4i32 && ST->hasSSE2() &&
      !ST->hasSSE41())
    return LT.first * 6;

  // Fallback to the default implementation.
  return TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty, Op1Info,
                                                     Op2Info);
}

unsigned X86TTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
                                Type *SubTp) const {
  // We only estimate the cost of reverse shuffles.
  if (Kind != SK_Reverse)
    return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);

  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
  unsigned Cost = 1;
  if (LT.second.getSizeInBits() > 128)
    Cost = 3; // Extract + insert + copy.

  // Multiply by the number of parts.
  return Cost * LT.first;
}

unsigned X86TTI::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const {
  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  std::pair<unsigned, MVT> LTSrc = TLI->getTypeLegalizationCost(Src);
  std::pair<unsigned, MVT> LTDest = TLI->getTypeLegalizationCost(Dst);

  static const TypeConversionCostTblEntry<MVT::SimpleValueType>
  SSE2ConvTbl[] = {
    // These are somewhat magic numbers justified by looking at the output of
    // Intel's IACA, running some kernels and making sure when we take
    // legalization into account the throughput will be overestimated.
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
    // There are faster sequences for float conversions.
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 15 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 },
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 15 },
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 },
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 },
  };

  if (ST->hasSSE2() && !ST->hasAVX()) {
    int Idx =
        ConvertCostTableLookup(SSE2ConvTbl, ISD, LTDest.second, LTSrc.second);
    if (Idx != -1)
      return LTSrc.first * SSE2ConvTbl[Idx].Cost;
  }

  EVT SrcTy = TLI->getValueType(Src);
  EVT DstTy = TLI->getValueType(Dst);

  // The function getSimpleVT only handles simple value types.
  if (!SrcTy.isSimple() || !DstTy.isSimple())
    return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);

  static const TypeConversionCostTblEntry<MVT::SimpleValueType>
  AVX2ConversionTbl[] = {
    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 3 },
    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 3 },
    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 3 },
    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 3 },
    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 3 },
    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 3 },
    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 1 },

    { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 2 },
    { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2 },
    { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 2 },
    { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2 },
    { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 2 },
    { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 4 },
  };

  static const TypeConversionCostTblEntry<MVT::SimpleValueType>
  AVXConversionTbl[] = {
    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 7 },
    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 4 },
    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 7 },
    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 4 },
    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 4 },
    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 4 },
    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 6 },
    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 4 },
    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 6 },
    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 4 },
    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 6 },
    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 4 },
    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 4 },

    { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 4 },
    { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 4 },
    { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 4 },
    { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 4 },
    { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 },
    { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 4 },
    { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 9 },

    { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i1, 8 },
    { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 8 },
    { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 5 },
    { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 },
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 3 },
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
    { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i1, 3 },
    { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i8, 3 },
    { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i16, 3 },
    { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 },

    { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i1, 6 },
    { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 5 },
    { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 5 },
    { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 9 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 7 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 2 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 6 },
    { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i1, 7 },
    { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i8, 2 },
    { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i16, 2 },
    { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 6 },
    // The generic code to compute the scalar overhead is currently broken.
    // Work around this limitation by estimating the scalarization overhead
    // here. We have roughly 10 instructions per scalar element.
    // Multiply that by the vector width.
    // FIXME: remove that when PR19268 is fixed.
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 },
    { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 4*10 },

    { ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f32, 7 },
    { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 1 },
    // This node is expanded into scalarized operations but BasicTTI is overly
    // optimistic in estimating its cost. It computes 3 per element (one
    // vector-extract, one scalar conversion and one vector-insert). The
    // problem is that the inserts form a read-modify-write chain so latency
    // should be factored in too. Inflating the cost per element by 1.
    { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 8*4 },
    { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 4*4 },
  };

  if (ST->hasAVX2()) {
    int Idx = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
                                     DstTy.getSimpleVT(), SrcTy.getSimpleVT());
    if (Idx != -1)
      return AVX2ConversionTbl[Idx].Cost;
  }

  if (ST->hasAVX()) {
    int Idx = ConvertCostTableLookup(AVXConversionTbl, ISD, DstTy.getSimpleVT(),
                                     SrcTy.getSimpleVT());
    if (Idx != -1)
      return AVXConversionTbl[Idx].Cost;
  }

  return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);
}

unsigned X86TTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                                    Type *CondTy) const {
  // Legalize the type.
  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(ValTy);

  MVT MTy = LT.second;

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  static const CostTblEntry<MVT::SimpleValueType> SSE42CostTbl[] = {
    { ISD::SETCC, MVT::v2f64, 1 },
    { ISD::SETCC, MVT::v4f32, 1 },
    { ISD::SETCC, MVT::v2i64, 1 },
    { ISD::SETCC, MVT::v4i32, 1 },
    { ISD::SETCC, MVT::v8i16, 1 },
    { ISD::SETCC, MVT::v16i8, 1 },
  };

  static const CostTblEntry<MVT::SimpleValueType> AVX1CostTbl[] = {
    { ISD::SETCC, MVT::v4f64, 1 },
    { ISD::SETCC, MVT::v8f32, 1 },
    // AVX1 does not support 8-wide integer compare.
    { ISD::SETCC, MVT::v4i64, 4 },
    { ISD::SETCC, MVT::v8i32, 4 },
    { ISD::SETCC, MVT::v16i16, 4 },
    { ISD::SETCC, MVT::v32i8, 4 },
  };

  static const CostTblEntry<MVT::SimpleValueType> AVX2CostTbl[] = {
    { ISD::SETCC, MVT::v4i64, 1 },
    { ISD::SETCC, MVT::v8i32, 1 },
    { ISD::SETCC, MVT::v16i16, 1 },
    { ISD::SETCC, MVT::v32i8, 1 },
  };

  if (ST->hasAVX2()) {
    int Idx = CostTableLookup(AVX2CostTbl, ISD, MTy);
    if (Idx != -1)
      return LT.first * AVX2CostTbl[Idx].Cost;
  }

  if (ST->hasAVX()) {
    int Idx = CostTableLookup(AVX1CostTbl, ISD, MTy);
    if (Idx != -1)
      return LT.first * AVX1CostTbl[Idx].Cost;
  }

  if (ST->hasSSE42()) {
    int Idx = CostTableLookup(SSE42CostTbl, ISD, MTy);
    if (Idx != -1)
      return LT.first * SSE42CostTbl[Idx].Cost;
  }

  return TargetTransformInfo::getCmpSelInstrCost(Opcode, ValTy, CondTy);
}

unsigned X86TTI::getVectorInstrCost(unsigned Opcode, Type *Val,
                                    unsigned Index) const {
  assert(Val->isVectorTy() && "This must be a vector type");

  if (Index != -1U) {
    // Legalize the type.
    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Val);

    // This type is legalized to a scalar type.
    if (!LT.second.isVector())
      return 0;

    // The type may be split. Normalize the index to the new type.
    unsigned Width = LT.second.getVectorNumElements();
    Index = Index % Width;

    // Floating point scalars are already located in index #0.
    if (Val->getScalarType()->isFloatingPointTy() && Index == 0)
      return 0;
  }

  return TargetTransformInfo::getVectorInstrCost(Opcode, Val, Index);
}

unsigned X86TTI::getScalarizationOverhead(Type *Ty, bool Insert,
                                          bool Extract) const {
  assert (Ty->isVectorTy() && "Can only scalarize vectors");
  unsigned Cost = 0;

  for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) {
    if (Insert)
      Cost += TopTTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
    if (Extract)
      Cost += TopTTI->getVectorInstrCost(Instruction::ExtractElement, Ty, i);
  }

  return Cost;
}

unsigned X86TTI::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
                                 unsigned AddressSpace) const {
  // Handle non-power-of-two vectors such as <3 x float>
  if (VectorType *VTy = dyn_cast<VectorType>(Src)) {
    unsigned NumElem = VTy->getVectorNumElements();

    // Handle a few common cases:
    // <3 x float>
    if (NumElem == 3 && VTy->getScalarSizeInBits() == 32)
      // Cost = 64 bit store + extract + 32 bit store.
      return 3;

    // <3 x double>
    if (NumElem == 3 && VTy->getScalarSizeInBits() == 64)
      // Cost = 128 bit store + unpack + 64 bit store.
      return 3;

    // Assume that all other non-power-of-two numbers are scalarized.
    if (!isPowerOf2_32(NumElem)) {
      unsigned Cost = TargetTransformInfo::getMemoryOpCost(Opcode,
                                                           VTy->getScalarType(),
                                                           Alignment,
                                                           AddressSpace);
      unsigned SplitCost = getScalarizationOverhead(Src,
                                                    Opcode == Instruction::Load,
                                                    Opcode == Instruction::Store);
      return NumElem * Cost + SplitCost;
    }
  }

  // Legalize the type.
  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src);
  assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
         "Invalid Opcode");

  // Each load/store unit costs 1.
  unsigned Cost = LT.first * 1;

  // On Sandybridge 256bit load/stores are double pumped
  // (but not on Haswell).
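  // For example, a load of <8 x float> legalizes to a single 256-bit piece
  // (LT.first == 1), so it costs 1 on an AVX2 target but 2 on an AVX-only
  // (Sandybridge-class) target.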
  if (LT.second.getSizeInBits() > 128 && !ST->hasAVX2())
    Cost *= 2;

  return Cost;
}

unsigned X86TTI::getAddressComputationCost(Type *Ty, bool IsComplex) const {
  // Address computations in vectorized code with non-consecutive addresses will
  // likely result in more instructions compared to scalar code where the
  // computation can more often be merged into the index mode. The resulting
  // extra micro-ops can significantly decrease throughput.
  unsigned NumVectorInstToHideOverhead = 10;

  if (Ty->isVectorTy() && IsComplex)
    return NumVectorInstToHideOverhead;

  return TargetTransformInfo::getAddressComputationCost(Ty, IsComplex);
}

unsigned X86TTI::getReductionCost(unsigned Opcode, Type *ValTy,
                                  bool IsPairwise) const {

  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(ValTy);

  MVT MTy = LT.second;

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  // We use the Intel Architecture Code Analyzer (IACA) to measure the
  // throughput and use that as the cost.

  static const CostTblEntry<MVT::SimpleValueType> SSE42CostTblPairWise[] = {
    { ISD::FADD, MVT::v2f64, 2 },
    { ISD::FADD, MVT::v4f32, 4 },
    { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
    { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.5".
    { ISD::ADD, MVT::v8i16, 5 },
  };

  static const CostTblEntry<MVT::SimpleValueType> AVX1CostTblPairWise[] = {
    { ISD::FADD, MVT::v4f32, 4 },
    { ISD::FADD, MVT::v4f64, 5 },
    { ISD::FADD, MVT::v8f32, 7 },
    { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
    { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.5".
    { ISD::ADD, MVT::v4i64, 5 }, // The data reported by the IACA tool is "4.8".
    { ISD::ADD, MVT::v8i16, 5 },
    { ISD::ADD, MVT::v8i32, 5 },
  };

  static const CostTblEntry<MVT::SimpleValueType> SSE42CostTblNoPairWise[] = {
    { ISD::FADD, MVT::v2f64, 2 },
    { ISD::FADD, MVT::v4f32, 4 },
    { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
    { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.3".
    { ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3".
  };

  static const CostTblEntry<MVT::SimpleValueType> AVX1CostTblNoPairWise[] = {
    { ISD::FADD, MVT::v4f32, 3 },
    { ISD::FADD, MVT::v4f64, 3 },
    { ISD::FADD, MVT::v8f32, 4 },
    { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
    { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "2.8".
    { ISD::ADD, MVT::v4i64, 3 },
    { ISD::ADD, MVT::v8i16, 4 },
    { ISD::ADD, MVT::v8i32, 5 },
  };

  if (IsPairwise) {
    if (ST->hasAVX()) {
      int Idx = CostTableLookup(AVX1CostTblPairWise, ISD, MTy);
      if (Idx != -1)
        return LT.first * AVX1CostTblPairWise[Idx].Cost;
    }

    if (ST->hasSSE42()) {
      int Idx = CostTableLookup(SSE42CostTblPairWise, ISD, MTy);
      if (Idx != -1)
        return LT.first * SSE42CostTblPairWise[Idx].Cost;
    }
  } else {
    if (ST->hasAVX()) {
      int Idx = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy);
      if (Idx != -1)
        return LT.first * AVX1CostTblNoPairWise[Idx].Cost;
    }

    if (ST->hasSSE42()) {
      int Idx = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy);
      if (Idx != -1)
        return LT.first * SSE42CostTblNoPairWise[Idx].Cost;
    }
  }

  return TargetTransformInfo::getReductionCost(Opcode, ValTy, IsPairwise);
}

unsigned X86TTI::getIntImmCost(const APInt &Imm, Type *Ty) const {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  if (Imm == 0)
    return TCC_Free;

  if (Imm.getBitWidth() <= 64 &&
      (isInt<32>(Imm.getSExtValue()) || isUInt<32>(Imm.getZExtValue())))
    return TCC_Basic;
  else
    return 2 * TCC_Basic;
}

unsigned X86TTI::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
                               Type *Ty) const {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  unsigned ImmIdx = ~0U;
  switch (Opcode) {
  default: return TCC_Free;
  case Instruction::GetElementPtr:
    // Always hoist the base address of a GetElementPtr. This prevents the
    // creation of new constants for every base constant that gets constant
    // folded with the offset.
    if (Idx == 0)
      return 2 * TCC_Basic;
    return TCC_Free;
  case Instruction::Store:
    ImmIdx = 0;
    break;
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::ICmp:
    ImmIdx = 1;
    break;
  case Instruction::Trunc:
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::IntToPtr:
  case Instruction::PtrToInt:
  case Instruction::BitCast:
  case Instruction::PHI:
  case Instruction::Call:
  case Instruction::Select:
  case Instruction::Ret:
  case Instruction::Load:
    break;
  }

  if ((Idx == ImmIdx) &&
      Imm.getBitWidth() <= 64 && isInt<32>(Imm.getSExtValue()))
    return TCC_Free;

  return X86TTI::getIntImmCost(Imm, Ty);
}

unsigned X86TTI::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
                               const APInt &Imm, Type *Ty) const {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  switch (IID) {
  default: return TCC_Free;
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
  case Intrinsic::smul_with_overflow:
  case Intrinsic::umul_with_overflow:
    if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<32>(Imm.getSExtValue()))
      return TCC_Free;
    break;
  case Intrinsic::experimental_stackmap:
    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TCC_Free;
    break;
  case Intrinsic::experimental_patchpoint_void:
  case Intrinsic::experimental_patchpoint_i64:
    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TCC_Free;
    break;
  }
  return X86TTI::getIntImmCost(Imm, Ty);
}
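Below is a minimal, hypothetical sketch (not part of revision 36b56886) of how a client pass in this era of LLVM would query the hooks implemented above. Clients never name X86TTI directly; they request the TargetTransformInfo analysis group, which resolves to X86TTI when the pipeline has been set up with the X86 target's analysis passes (for example via TargetMachine::addAnalysisPasses). The pass name TTICostQueryExample and its behaviour are illustrative assumptions; only the TTI calls correspond to the interfaces in this file, and the usual pass registration boilerplate is omitted.

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/Pass.h"

using namespace llvm;

namespace {
// Hypothetical client pass: asks the active TTI implementation (X86TTI when
// targeting x86) how expensive a vector multiply is expected to be.
struct TTICostQueryExample : public FunctionPass {
  static char ID;
  TTICostQueryExample() : FunctionPass(ID) {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<TargetTransformInfo>();
    AU.setPreservesAll();
  }

  bool runOnFunction(Function &F) override {
    const TargetTransformInfo &TTI = getAnalysis<TargetTransformInfo>();

    // Cost of a <8 x i32> multiply. On an AVX (but not AVX2) subtarget the
    // AVX1CostTable above yields 4: two half-width multiplies plus one
    // extract and one insert.
    Type *VecTy = VectorType::get(Type::getInt32Ty(F.getContext()), 8);
    unsigned MulCost = TTI.getArithmeticInstrCost(Instruction::Mul, VecTy);
    (void)MulCost; // A real client would weigh this against a scalar cost.
    return false;
  }
};
} // end anonymous namespace

char TTICostQueryExample::ID = 0;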