//===-- PPCTargetTransformInfo.cpp - PPC specific TTI ---------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//

#include "PPCTargetTransformInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/CostTable.h"
#include "llvm/Target/TargetLowering.h"
using namespace llvm;

#define DEBUG_TYPE "ppctti"

static cl::opt<bool> DisablePPCConstHoist("disable-ppc-constant-hoisting",
    cl::desc("disable constant hoisting on PPC"), cl::init(false), cl::Hidden);

// This is currently only used for the data prefetch pass, which is only
// enabled for BG/Q by default.
static cl::opt<unsigned>
CacheLineSize("ppc-loop-prefetch-cache-line", cl::Hidden, cl::init(64),
              cl::desc("The loop prefetch cache line size"));

//===----------------------------------------------------------------------===//
//
// PPC cost model.
//
//===----------------------------------------------------------------------===//

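// Report how population count should be lowered on this subtarget: hardware
// (fast or slow) when POPCNTD is available and the type fits in 64 bits,
// otherwise a software expansion.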
TargetTransformInfo::PopcntSupportKind
PPCTTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  if (ST->hasPOPCNTD() != PPCSubtarget::POPCNTD_Unavailable && TyWidth <= 64)
    return ST->hasPOPCNTD() == PPCSubtarget::POPCNTD_Slow ?
             TTI::PSK_SlowHardware : TTI::PSK_FastHardware;
  return TTI::PSK_Software;
}

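// Cost of materializing an integer immediate into a register. The tiers below
// roughly track how many instructions the constant needs; for example, with a
// 64-bit immediate width:
//   42          -> TCC_Basic      (fits a signed 16-bit load-immediate)
//   0x12340000  -> TCC_Basic      (low 16 bits zero; a single lis)
//   0x12345678  -> 2 * TCC_Basic  (e.g. lis followed by ori)
//   wider       -> 4 * TCC_Basic  (a full 64-bit materialization sequence)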
int PPCTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
  if (DisablePPCConstHoist)
    return BaseT::getIntImmCost(Imm, Ty);

  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  if (Imm == 0)
    return TTI::TCC_Free;

  if (Imm.getBitWidth() <= 64) {
    if (isInt<16>(Imm.getSExtValue()))
      return TTI::TCC_Basic;

    if (isInt<32>(Imm.getSExtValue())) {
      // A constant that can be materialized using lis.
      if ((Imm.getZExtValue() & 0xFFFF) == 0)
        return TTI::TCC_Basic;

      return 2 * TTI::TCC_Basic;
    }
  }

  return 4 * TTI::TCC_Basic;
}

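// Cost of an integer immediate used as an operand of an intrinsic call.
// Immediates that the lowering can encode directly (16-bit signed operands of
// the overflow intrinsics, and stackmap/patchpoint operands that either are
// leading meta-operands or fit in a signed 64-bit value) are reported as
// free; anything else falls back to the plain materialization cost above.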
int PPCTTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
                              Type *Ty) {
  if (DisablePPCConstHoist)
    return BaseT::getIntImmCost(IID, Idx, Imm, Ty);

  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  switch (IID) {
  default:
    return TTI::TCC_Free;
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
    if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<16>(Imm.getSExtValue()))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_stackmap:
    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_patchpoint_void:
  case Intrinsic::experimental_patchpoint_i64:
    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  }
  return PPCTTIImpl::getIntImmCost(Imm, Ty);
}

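// Cost of an integer immediate used by a plain instruction. The flags below
// record, per opcode, which immediate forms the PPC instruction set can fold
// directly (and which are therefore free to leave un-hoisted):
//   ShiftedFree  - low 16 bits zero, e.g. the addis/oris/xoris shifted forms
//   RunFree      - a contiguous run of mask bits, for the rotate-and-mask
//                  instructions
//   UnsignedFree - an unsigned 16-bit value, e.g. unsigned compare immediates
//   ZeroFree     - a comparison or select against zero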
int PPCTTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
                              Type *Ty) {
  if (DisablePPCConstHoist)
    return BaseT::getIntImmCost(Opcode, Idx, Imm, Ty);

  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  unsigned ImmIdx = ~0U;
  bool ShiftedFree = false, RunFree = false, UnsignedFree = false,
       ZeroFree = false;
  switch (Opcode) {
  default:
    return TTI::TCC_Free;
  case Instruction::GetElementPtr:
    // Always hoist the base address of a GetElementPtr. This prevents the
    // creation of new constants for every base constant that gets constant
    // folded with the offset.
    if (Idx == 0)
      return 2 * TTI::TCC_Basic;
    return TTI::TCC_Free;
  case Instruction::And:
    RunFree = true; // (for the rotate-and-mask instructions)
    // Fallthrough...
  case Instruction::Add:
  case Instruction::Or:
  case Instruction::Xor:
    ShiftedFree = true;
    // Fallthrough...
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    ImmIdx = 1;
    break;
  case Instruction::ICmp:
    UnsignedFree = true;
    ImmIdx = 1;
    // Fallthrough... (zero comparisons can use record-form instructions)
  case Instruction::Select:
    ZeroFree = true;
    break;
  case Instruction::PHI:
  case Instruction::Call:
  case Instruction::Ret:
  case Instruction::Load:
  case Instruction::Store:
    break;
  }

  if (ZeroFree && Imm == 0)
    return TTI::TCC_Free;

  if (Idx == ImmIdx && Imm.getBitWidth() <= 64) {
    if (isInt<16>(Imm.getSExtValue()))
      return TTI::TCC_Free;

    if (RunFree) {
      if (Imm.getBitWidth() <= 32 &&
          (isShiftedMask_32(Imm.getZExtValue()) ||
           isShiftedMask_32(~Imm.getZExtValue())))
        return TTI::TCC_Free;

      if (ST->isPPC64() &&
          (isShiftedMask_64(Imm.getZExtValue()) ||
           isShiftedMask_64(~Imm.getZExtValue())))
        return TTI::TCC_Free;
    }

    if (UnsignedFree && isUInt<16>(Imm.getZExtValue()))
      return TTI::TCC_Free;

    if (ShiftedFree && (Imm.getZExtValue() & 0xFFFF) == 0)
      return TTI::TCC_Free;
  }

  return PPCTTIImpl::getIntImmCost(Imm, Ty);
}

void PPCTTIImpl::getUnrollingPreferences(Loop *L,
                                         TTI::UnrollingPreferences &UP) {
  if (ST->getDarwinDirective() == PPC::DIR_A2) {
    // The A2 is in-order with a deep pipeline, and concatenation unrolling
    // helps expose latency-hiding opportunities to the instruction scheduler.
    UP.Partial = UP.Runtime = true;

    // We unroll a lot on the A2 (hundreds of instructions), and the benefits
    // often outweigh the cost of a division to compute the trip count.
    UP.AllowExpensiveTripCount = true;
  }

  BaseT::getUnrollingPreferences(L, UP);
}

bool PPCTTIImpl::enableAggressiveInterleaving(bool LoopHasReductions) {
  // On the A2, always interleave aggressively. For QPX unaligned loads, we
  // depend on combining the loads generated for consecutive accesses, and
  // failure to do so is particularly expensive. Aggressive interleaving makes
  // such combining much more likely than concatenation unrolling alone.
  if (ST->getDarwinDirective() == PPC::DIR_A2)
    return true;

  return LoopHasReductions;
}

bool PPCTTIImpl::enableInterleavedAccessVectorization() {
  return true;
}

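// Number of registers available to the register-pressure heuristics. VSX
// provides 64 vector-scalar registers (overlapping the FPRs and the Altivec
// VRs); otherwise there are 32 registers in each relevant class.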
unsigned PPCTTIImpl::getNumberOfRegisters(bool Vector) {
  if (Vector && !ST->hasAltivec() && !ST->hasQPX())
    return 0;
  return ST->hasVSX() ? 64 : 32;
}

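// Widest register the vectorizer should target: 256-bit vectors with QPX,
// 128-bit vectors with Altivec/VSX, and the native GPR width for scalar code.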
unsigned PPCTTIImpl::getRegisterBitWidth(bool Vector) {
  if (Vector) {
    if (ST->hasQPX()) return 256;
    if (ST->hasAltivec()) return 128;
    return 0;
  }

  if (ST->isPPC64())
    return 64;
  return 32;
}

unsigned PPCTTIImpl::getCacheLineSize() {
  // This is currently only used for the data prefetch pass, which is only
  // enabled for BG/Q by default.
  return CacheLineSize;
}

unsigned PPCTTIImpl::getPrefetchDistance() {
  // This seems like a reasonable default for the BG/Q (this pass is enabled,
  // by default, only on the BG/Q).
  return 300;
}

unsigned PPCTTIImpl::getMaxInterleaveFactor(unsigned VF) {
  unsigned Directive = ST->getDarwinDirective();
  // The 440 has no SIMD support, but floating-point instructions
  // have a 5-cycle latency, so unroll by 5x for latency hiding.
  if (Directive == PPC::DIR_440)
    return 5;

  // The A2 has no SIMD support, but floating-point instructions
  // have a 6-cycle latency, so unroll by 6x for latency hiding.
  if (Directive == PPC::DIR_A2)
    return 6;

  // FIXME: For lack of any better information, do no harm...
  if (Directive == PPC::DIR_E500mc || Directive == PPC::DIR_E5500)
    return 1;

  // For P7 and P8, floating-point instructions have a 6-cycle latency and
  // there are two execution units, so unroll by 12x for latency hiding.
  // FIXME: Use the same value for P9 as for the previous generation until
  // POWER9 scheduling information is ready.
  if (Directive == PPC::DIR_PWR7 || Directive == PPC::DIR_PWR8 ||
      Directive == PPC::DIR_PWR9)
    return 12;

  // For most things, modern systems have two execution units (and
  // out-of-order execution).
  return 2;
}

int PPCTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info,
    TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo,
    TTI::OperandValueProperties Opd2PropInfo) {
  assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");

  // Fall back to the default implementation.
  return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info,
                                       Opd1PropInfo, Opd2PropInfo);
}

int PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
                               Type *SubTp) {
  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);

  // PPC, for both Altivec/VSX and QPX, supports cheap arbitrary permutations
  // (at least in the sense that there need only be one non-loop-invariant
  // instruction). We need one such shuffle instruction for each actual
  // register (this is not true for arbitrary shuffles, but is true for the
  // structured types of shuffles covered by TTI::ShuffleKind).
  return LT.first;
}

int PPCTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
  assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");

  return BaseT::getCastInstrCost(Opcode, Dst, Src);
}

int PPCTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) {
  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy);
}

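// Cost of inserting into or extracting from a vector lane. Lane 0 of
// floating-point vectors is free with VSX (f64) and QPX, since the scalar
// already lives there; on plain Altivec, insert/extract goes through memory
// and pays a load-hit-store penalty, so it is modeled as very expensive.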
int PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
  assert(Val->isVectorTy() && "This must be a vector type");

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  if (ST->hasVSX() && Val->getScalarType()->isDoubleTy()) {
    // Double-precision scalars are already located in index #0.
    if (Index == 0)
      return 0;

    return BaseT::getVectorInstrCost(Opcode, Val, Index);
  } else if (ST->hasQPX() && Val->getScalarType()->isFloatingPointTy()) {
    // Floating-point scalars are already located in index #0.
    if (Index == 0)
      return 0;

    return BaseT::getVectorInstrCost(Opcode, Val, Index);
  }

  // Estimated cost of a load-hit-store delay.  This was obtained
  // experimentally as a minimum needed to prevent unprofitable
  // vectorization for the paq8p benchmark.  It may need to be
  // raised further if other unprofitable cases remain.
  unsigned LHSPenalty = 2;
  if (ISD == ISD::INSERT_VECTOR_ELT)
    LHSPenalty += 7;

  // Vector element insert/extract with Altivec are very expensive,
  // because they require a store and reload with the attendant
  // processor stall for load-hit-store.  Until VSX is available,
  // these need to be estimated as very costly.
  if (ISD == ISD::EXTRACT_VECTOR_ELT ||
      ISD == ISD::INSERT_VECTOR_ELT)
    return LHSPenalty + BaseT::getVectorInstrCost(Opcode, Val, Index);

  return BaseT::getVectorInstrCost(Opcode, Val, Index);
}

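// Cost of a scalar or vector load/store. Sufficiently aligned accesses, and
// unaligned vector accesses the subtarget handles directly (VSX, or the
// permutation-based Altivec/QPX load sequence), stay close to the base cost.
// Anything else is decomposed by alignment; for example, storing a 16-byte
// v4i32 with 4-byte alignment on a pre-VSX target adds
// LT.first * (16 / 4 - 1) scalar stores plus the per-element extraction cost.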
int PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
                                unsigned AddressSpace) {
  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
  assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
         "Invalid Opcode");

  int Cost = BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace);

  // Aligned loads and stores are easy.
  unsigned SrcBytes = LT.second.getStoreSize();
  if (!SrcBytes || !Alignment || Alignment >= SrcBytes)
    return Cost;

  bool IsAltivecType = ST->hasAltivec() &&
                       (LT.second == MVT::v16i8 || LT.second == MVT::v8i16 ||
                        LT.second == MVT::v4i32 || LT.second == MVT::v4f32);
  bool IsVSXType = ST->hasVSX() &&
                   (LT.second == MVT::v2f64 || LT.second == MVT::v2i64);
  bool IsQPXType = ST->hasQPX() &&
                   (LT.second == MVT::v4f64 || LT.second == MVT::v4f32);

  // If we can use the permutation-based load sequence, then this is also
  // relatively cheap (not counting loop-invariant instructions): one load plus
  // one permute (the last load in a series has extra cost, but we're
  // neglecting that here). Note that on the P7, we could do unaligned loads
  // for Altivec types using the VSX instructions, but that's more expensive
  // than using the permutation-based load sequence. On the P8, that's no
  // longer true.
  if (Opcode == Instruction::Load &&
      ((!ST->hasP8Vector() && IsAltivecType) || IsQPXType) &&
      Alignment >= LT.second.getScalarType().getStoreSize())
    return Cost + LT.first; // Add the cost of the permutations.

  // With VSX, we can do unaligned loads and stores on Altivec/VSX types. On
  // the P7, unaligned vector loads are more expensive than the
  // permutation-based load sequence, so that might be used instead, but
  // regardless, the net cost is about the same (not counting loop-invariant
  // instructions).
  if (IsVSXType || (ST->hasVSX() && IsAltivecType))
    return Cost;

  // PPC in general does not support unaligned loads and stores. They'll need
  // to be decomposed based on the alignment factor.

  // Add the cost of each scalar load or store.
  Cost += LT.first * (SrcBytes / Alignment - 1);

  // For a vector type, there is also scalarization overhead (only for
  // stores; loads are expanded using the vector-load + permutation sequence,
  // which is much less expensive).
  if (Src->isVectorTy() && Opcode == Instruction::Store)
    for (int i = 0, e = Src->getVectorNumElements(); i < e; ++i)
      Cost += getVectorInstrCost(Instruction::ExtractElement, Src, i);

  return Cost;
}

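// Cost of an interleaved access group: the underlying wide load/store plus
// one permutation per result vector for each incoming register beyond the
// first, i.e. Factor * (LT.first - 1) shuffles in total.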
int PPCTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                           unsigned Factor,
                                           ArrayRef<unsigned> Indices,
                                           unsigned Alignment,
                                           unsigned AddressSpace) {
  assert(isa<VectorType>(VecTy) &&
         "Expect a vector type for interleaved memory op");

  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, VecTy);

  // First, the cost of the load/store operation.
  int Cost = getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace);

  // PPC, for both Altivec/VSX and QPX, supports cheap arbitrary permutations
  // (at least in the sense that there need only be one non-loop-invariant
  // instruction). For each result vector, we need one shuffle per incoming
  // vector (except that the first shuffle can take two incoming vectors
  // because it does not need to take itself).
  Cost += Factor * (LT.first - 1);

  return Cost;
}