// X86TargetTransformInfo.cpp revision 5f0d9dbdf48a9efe16bfadf88e5335f7b9a8ec3f
1//===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===// 2// 3// The LLVM Compiler Infrastructure 4// 5// This file is distributed under the University of Illinois Open Source 6// License. See LICENSE.TXT for details. 7// 8//===----------------------------------------------------------------------===// 9/// \file 10/// This file implements a TargetTransformInfo analysis pass specific to the 11/// X86 target machine. It uses the target's detailed information to provide 12/// more precise answers to certain TTI queries, while letting the target 13/// independent and default TTI implementations handle the rest. 14/// 15//===----------------------------------------------------------------------===// 16 17#define DEBUG_TYPE "x86tti" 18#include "X86.h" 19#include "X86TargetMachine.h" 20#include "llvm/Analysis/TargetTransformInfo.h" 21#include "llvm/Support/Debug.h" 22#include "llvm/Target/TargetLowering.h" 23#include "llvm/Target/CostTable.h" 24using namespace llvm; 25 26// Declare the pass initialization routine locally as target-specific passes 27// don't havve a target-wide initialization entry point, and so we rely on the 28// pass constructor initialization. 29namespace llvm { 30void initializeX86TTIPass(PassRegistry &); 31} 32 33namespace { 34 35class X86TTI : public ImmutablePass, public TargetTransformInfo { 36 const X86TargetMachine *TM; 37 const X86Subtarget *ST; 38 const X86TargetLowering *TLI; 39 40 /// Estimate the overhead of scalarizing an instruction. Insert and Extract 41 /// are set if the result needs to be inserted and/or extracted from vectors. 
42 unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const; 43 44public: 45 X86TTI() : ImmutablePass(ID), TM(0), ST(0), TLI(0) { 46 llvm_unreachable("This pass cannot be directly constructed"); 47 } 48 49 X86TTI(const X86TargetMachine *TM) 50 : ImmutablePass(ID), TM(TM), ST(TM->getSubtargetImpl()), 51 TLI(TM->getTargetLowering()) { 52 initializeX86TTIPass(*PassRegistry::getPassRegistry()); 53 } 54 55 virtual void initializePass() { 56 pushTTIStack(this); 57 } 58 59 virtual void finalizePass() { 60 popTTIStack(); 61 } 62 63 virtual void getAnalysisUsage(AnalysisUsage &AU) const { 64 TargetTransformInfo::getAnalysisUsage(AU); 65 } 66 67 /// Pass identification. 68 static char ID; 69 70 /// Provide necessary pointer adjustments for the two base classes. 71 virtual void *getAdjustedAnalysisPointer(const void *ID) { 72 if (ID == &TargetTransformInfo::ID) 73 return (TargetTransformInfo*)this; 74 return this; 75 } 76 77 /// \name Scalar TTI Implementations 78 /// @{ 79 virtual PopcntSupportKind getPopcntSupport(unsigned TyWidth) const; 80 81 /// @} 82 83 /// \name Vector TTI Implementations 84 /// @{ 85 86 virtual unsigned getNumberOfRegisters(bool Vector) const; 87 virtual unsigned getRegisterBitWidth(bool Vector) const; 88 virtual unsigned getMaximumUnrollFactor() const; 89 virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty) const; 90 virtual unsigned getShuffleCost(ShuffleKind Kind, Type *Tp, 91 int Index, Type *SubTp) const; 92 virtual unsigned getCastInstrCost(unsigned Opcode, Type *Dst, 93 Type *Src) const; 94 virtual unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, 95 Type *CondTy) const; 96 virtual unsigned getVectorInstrCost(unsigned Opcode, Type *Val, 97 unsigned Index) const; 98 virtual unsigned getMemoryOpCost(unsigned Opcode, Type *Src, 99 unsigned Alignment, 100 unsigned AddressSpace) const; 101 102 /// @} 103}; 104 105} // end anonymous namespace 106 107INITIALIZE_AG_PASS(X86TTI, TargetTransformInfo, "x86tti", 
108 "X86 Target Transform Info", true, true, false) 109char X86TTI::ID = 0; 110 111ImmutablePass * 112llvm::createX86TargetTransformInfoPass(const X86TargetMachine *TM) { 113 return new X86TTI(TM); 114} 115 116 117//===----------------------------------------------------------------------===// 118// 119// X86 cost model. 120// 121//===----------------------------------------------------------------------===// 122 123X86TTI::PopcntSupportKind X86TTI::getPopcntSupport(unsigned TyWidth) const { 124 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2"); 125 // TODO: Currently the __builtin_popcount() implementation using SSE3 126 // instructions is inefficient. Once the problem is fixed, we should 127 // call ST->hasSSE3() instead of ST->hasSSE4(). 128 return ST->hasSSE41() ? PSK_FastHardware : PSK_Software; 129} 130 131unsigned X86TTI::getNumberOfRegisters(bool Vector) const { 132 if (Vector && !ST->hasSSE1()) 133 return 0; 134 135 if (ST->is64Bit()) 136 return 16; 137 return 8; 138} 139 140unsigned X86TTI::getRegisterBitWidth(bool Vector) const { 141 if (Vector) { 142 if (ST->hasAVX()) return 256; 143 if (ST->hasSSE1()) return 128; 144 return 0; 145 } 146 147 if (ST->is64Bit()) 148 return 64; 149 return 32; 150 151} 152 153unsigned X86TTI::getMaximumUnrollFactor() const { 154 if (ST->isAtom()) 155 return 1; 156 157 // Sandybridge and Haswell have multiple execution ports and pipelined 158 // vector units. 159 if (ST->hasAVX()) 160 return 4; 161 162 return 2; 163} 164 165unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty) const { 166 // Legalize the type. 167 std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty); 168 169 int ISD = TLI->InstructionOpcodeToISD(Opcode); 170 assert(ISD && "Invalid opcode"); 171 172 static const CostTblEntry<MVT> AVX1CostTable[] = { 173 // We don't have to scalarize unsupported ops. We can issue two half-sized 174 // operations and we only need to extract the upper YMM half. 
175 // Two ops + 1 extract + 1 insert = 4. 176 { ISD::MUL, MVT::v8i32, 4 }, 177 { ISD::SUB, MVT::v8i32, 4 }, 178 { ISD::ADD, MVT::v8i32, 4 }, 179 { ISD::SUB, MVT::v4i64, 4 }, 180 { ISD::ADD, MVT::v4i64, 4 }, 181 // A v4i64 multiply is custom lowered as two split v2i64 vectors that then 182 // are lowered as a series of long multiplies(3), shifts(4) and adds(2) 183 // Because we believe v4i64 to be a legal type, we must also include the 184 // split factor of two in the cost table. Therefore, the cost here is 18 185 // instead of 9. 186 { ISD::MUL, MVT::v4i64, 18 }, 187 }; 188 189 // Look for AVX1 lowering tricks. 190 if (ST->hasAVX() && !ST->hasAVX2()) { 191 int Idx = CostTableLookup<MVT>(AVX1CostTable, array_lengthof(AVX1CostTable), 192 ISD, LT.second); 193 if (Idx != -1) 194 return LT.first * AVX1CostTable[Idx].Cost; 195 } 196 197 // Custom lowering of vectors. 198 static const CostTblEntry<MVT> CustomLowered[] = { 199 // A v2i64/v4i64 and multiply is custom lowered as a series of long 200 // multiplies(3), shifts(4) and adds(2). 201 { ISD::MUL, MVT::v2i64, 9 }, 202 { ISD::MUL, MVT::v4i64, 9 }, 203 }; 204 int Idx = CostTableLookup<MVT>(CustomLowered, array_lengthof(CustomLowered), 205 ISD, LT.second); 206 if (Idx != -1) 207 return LT.first * CustomLowered[Idx].Cost; 208 209 // Special lowering of v4i32 mul on sse2, sse3: Lower v4i32 mul as 2x shuffle, 210 // 2x pmuludq, 2x shuffle. 211 if (ISD == ISD::MUL && LT.second == MVT::v4i32 && ST->hasSSE2() && 212 !ST->hasSSE41()) 213 return 6; 214 215 // Fallback to the default implementation. 216 return TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty); 217} 218 219unsigned X86TTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index, 220 Type *SubTp) const { 221 // We only estimate the cost of reverse shuffles. 
222 if (Kind != SK_Reverse) 223 return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp); 224 225 std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp); 226 unsigned Cost = 1; 227 if (LT.second.getSizeInBits() > 128) 228 Cost = 3; // Extract + insert + copy. 229 230 // Multiple by the number of parts. 231 return Cost * LT.first; 232} 233 234unsigned X86TTI::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const { 235 int ISD = TLI->InstructionOpcodeToISD(Opcode); 236 assert(ISD && "Invalid opcode"); 237 238 EVT SrcTy = TLI->getValueType(Src); 239 EVT DstTy = TLI->getValueType(Dst); 240 241 if (!SrcTy.isSimple() || !DstTy.isSimple()) 242 return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src); 243 244 static const TypeConversionCostTblEntry<MVT> AVXConversionTbl[] = { 245 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 }, 246 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 }, 247 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 }, 248 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 1 }, 249 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, 250 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 1 }, 251 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 1 }, 252 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 1 }, 253 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 1 }, 254 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 1 }, 255 { ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f32, 1 }, 256 { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 1 }, 257 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 6 }, 258 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 9 }, 259 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 8 }, 260 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 8 }, 261 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 8 }, 262 { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 3 }, 263 }; 264 265 if (ST->hasAVX()) { 266 int Idx = ConvertCostTableLookup<MVT>(AVXConversionTbl, 267 array_lengthof(AVXConversionTbl), 268 ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()); 269 if (Idx != -1) 270 return AVXConversionTbl[Idx].Cost; 271 } 272 273 
return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src); 274} 275 276unsigned X86TTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, 277 Type *CondTy) const { 278 // Legalize the type. 279 std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(ValTy); 280 281 MVT MTy = LT.second; 282 283 int ISD = TLI->InstructionOpcodeToISD(Opcode); 284 assert(ISD && "Invalid opcode"); 285 286 static const CostTblEntry<MVT> SSE42CostTbl[] = { 287 { ISD::SETCC, MVT::v2f64, 1 }, 288 { ISD::SETCC, MVT::v4f32, 1 }, 289 { ISD::SETCC, MVT::v2i64, 1 }, 290 { ISD::SETCC, MVT::v4i32, 1 }, 291 { ISD::SETCC, MVT::v8i16, 1 }, 292 { ISD::SETCC, MVT::v16i8, 1 }, 293 }; 294 295 static const CostTblEntry<MVT> AVX1CostTbl[] = { 296 { ISD::SETCC, MVT::v4f64, 1 }, 297 { ISD::SETCC, MVT::v8f32, 1 }, 298 // AVX1 does not support 8-wide integer compare. 299 { ISD::SETCC, MVT::v4i64, 4 }, 300 { ISD::SETCC, MVT::v8i32, 4 }, 301 { ISD::SETCC, MVT::v16i16, 4 }, 302 { ISD::SETCC, MVT::v32i8, 4 }, 303 }; 304 305 static const CostTblEntry<MVT> AVX2CostTbl[] = { 306 { ISD::SETCC, MVT::v4i64, 1 }, 307 { ISD::SETCC, MVT::v8i32, 1 }, 308 { ISD::SETCC, MVT::v16i16, 1 }, 309 { ISD::SETCC, MVT::v32i8, 1 }, 310 }; 311 312 if (ST->hasAVX2()) { 313 int Idx = CostTableLookup<MVT>(AVX2CostTbl, array_lengthof(AVX2CostTbl), ISD, MTy); 314 if (Idx != -1) 315 return LT.first * AVX2CostTbl[Idx].Cost; 316 } 317 318 if (ST->hasAVX()) { 319 int Idx = CostTableLookup<MVT>(AVX1CostTbl, array_lengthof(AVX1CostTbl), ISD, MTy); 320 if (Idx != -1) 321 return LT.first * AVX1CostTbl[Idx].Cost; 322 } 323 324 if (ST->hasSSE42()) { 325 int Idx = CostTableLookup<MVT>(SSE42CostTbl, array_lengthof(SSE42CostTbl), ISD, MTy); 326 if (Idx != -1) 327 return LT.first * SSE42CostTbl[Idx].Cost; 328 } 329 330 return TargetTransformInfo::getCmpSelInstrCost(Opcode, ValTy, CondTy); 331} 332 333unsigned X86TTI::getVectorInstrCost(unsigned Opcode, Type *Val, 334 unsigned Index) const { 335 assert(Val->isVectorTy() && "This must be a vector 
type"); 336 337 if (Index != -1U) { 338 // Legalize the type. 339 std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Val); 340 341 // This type is legalized to a scalar type. 342 if (!LT.second.isVector()) 343 return 0; 344 345 // The type may be split. Normalize the index to the new type. 346 unsigned Width = LT.second.getVectorNumElements(); 347 Index = Index % Width; 348 349 // Floating point scalars are already located in index #0. 350 if (Val->getScalarType()->isFloatingPointTy() && Index == 0) 351 return 0; 352 } 353 354 return TargetTransformInfo::getVectorInstrCost(Opcode, Val, Index); 355} 356 357unsigned X86TTI::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, 358 unsigned AddressSpace) const { 359 // Legalize the type. 360 std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src); 361 assert((Opcode == Instruction::Load || Opcode == Instruction::Store) && 362 "Invalid Opcode"); 363 364 // Each load/store unit costs 1. 365 unsigned Cost = LT.first * 1; 366 367 // On Sandybridge 256bit load/stores are double pumped 368 // (but not on Haswell). 369 if (LT.second.getSizeInBits() > 128 && !ST->hasAVX2()) 370 Cost*=2; 371 372 return Cost; 373} 374